Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--   arch/x86/crypto/Makefile                        7
-rw-r--r--   arch/x86/crypto/ablk_helper.c                 149
-rw-r--r--   arch/x86/crypto/aesni-intel_avx-x86_64.S     2811
-rw-r--r--   arch/x86/crypto/aesni-intel_glue.c            149
-rw-r--r--   arch/x86/crypto/blowfish_glue.c                 3
-rw-r--r--   arch/x86/crypto/camellia_aesni_avx2_glue.c      2
-rw-r--r--   arch/x86/crypto/camellia_aesni_avx_glue.c       2
-rw-r--r--   arch/x86/crypto/cast5_avx_glue.c                5
-rw-r--r--   arch/x86/crypto/cast6_avx_glue.c                2
-rw-r--r--   arch/x86/crypto/ghash-clmulni-intel_asm.S      33
-rw-r--r--   arch/x86/crypto/ghash-clmulni-intel_glue.c     20
-rw-r--r--   arch/x86/crypto/serpent_avx2_glue.c             2
-rw-r--r--   arch/x86/crypto/serpent_avx_glue.c              2
-rw-r--r--   arch/x86/crypto/serpent_sse2_glue.c             2
-rw-r--r--   arch/x86/crypto/sha1_avx2_x86_64_asm.S        708
-rw-r--r--   arch/x86/crypto/sha1_ssse3_glue.c              53
-rw-r--r--   arch/x86/crypto/sha256_ssse3_glue.c             4
-rw-r--r--   arch/x86/crypto/sha512_ssse3_glue.c             2
-rw-r--r--   arch/x86/crypto/twofish_avx_glue.c              2
19 files changed, 3744 insertions, 214 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 7d6ba9db1be..61d6e281898 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -3,8 +3,9 @@
#
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+ $(comma)4)$(comma)%ymm2,yes,no)
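+# ($(call as-instr,<insn>,yes,no) is the kbuild idiom for probing assembler
+# support: it expands to "yes" only if $(AS) accepts <insn>. vpgatherdd is
+# used as the probe because it exists only in AVX2; $(comma) stands in for
+# ",", which would otherwise split the call arguments.)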
-obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
@@ -75,8 +76,12 @@ ifeq ($(avx2_supported),yes)
endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+ifeq ($(avx2_supported),yes)
+sha1-ssse3-y += sha1_avx2_x86_64_asm.o
+endif
crc32c-intel-y := crc32c-intel_glue.o
crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
diff --git a/arch/x86/crypto/ablk_helper.c b/arch/x86/crypto/ablk_helper.c
deleted file mode 100644
index 43282fe04a8..00000000000
--- a/arch/x86/crypto/ablk_helper.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Shared async block cipher helpers
- *
- * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * Based on aesni-intel_glue.c by:
- * Copyright (C) 2008, Intel Corp.
- * Author: Huang Ying <ying.huang@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/crypto.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <crypto/algapi.h>
-#include <crypto/cryptd.h>
-#include <asm/i387.h>
-#include <asm/crypto/ablk_helper.h>
-
-int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
- unsigned int key_len)
-{
- struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
- struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
- int err;
-
- crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
- crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
- & CRYPTO_TFM_REQ_MASK);
- err = crypto_ablkcipher_setkey(child, key, key_len);
- crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
- & CRYPTO_TFM_RES_MASK);
- return err;
-}
-EXPORT_SYMBOL_GPL(ablk_set_key);
-
-int __ablk_encrypt(struct ablkcipher_request *req)
-{
- struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
- struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
- struct blkcipher_desc desc;
-
- desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
- desc.info = req->info;
- desc.flags = 0;
-
- return crypto_blkcipher_crt(desc.tfm)->encrypt(
- &desc, req->dst, req->src, req->nbytes);
-}
-EXPORT_SYMBOL_GPL(__ablk_encrypt);
-
-int ablk_encrypt(struct ablkcipher_request *req)
-{
- struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
- struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-
- if (!irq_fpu_usable()) {
- struct ablkcipher_request *cryptd_req =
- ablkcipher_request_ctx(req);
-
- memcpy(cryptd_req, req, sizeof(*req));
- ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-
- return crypto_ablkcipher_encrypt(cryptd_req);
- } else {
- return __ablk_encrypt(req);
- }
-}
-EXPORT_SYMBOL_GPL(ablk_encrypt);
-
-int ablk_decrypt(struct ablkcipher_request *req)
-{
- struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
- struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-
- if (!irq_fpu_usable()) {
- struct ablkcipher_request *cryptd_req =
- ablkcipher_request_ctx(req);
-
- memcpy(cryptd_req, req, sizeof(*req));
- ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-
- return crypto_ablkcipher_decrypt(cryptd_req);
- } else {
- struct blkcipher_desc desc;
-
- desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
- desc.info = req->info;
- desc.flags = 0;
-
- return crypto_blkcipher_crt(desc.tfm)->decrypt(
- &desc, req->dst, req->src, req->nbytes);
- }
-}
-EXPORT_SYMBOL_GPL(ablk_decrypt);
-
-void ablk_exit(struct crypto_tfm *tfm)
-{
- struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
-
- cryptd_free_ablkcipher(ctx->cryptd_tfm);
-}
-EXPORT_SYMBOL_GPL(ablk_exit);
-
-int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
-{
- struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
- struct cryptd_ablkcipher *cryptd_tfm;
-
- cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
-
- ctx->cryptd_tfm = cryptd_tfm;
- tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
- crypto_ablkcipher_reqsize(&cryptd_tfm->base);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(ablk_init_common);
-
-int ablk_init(struct crypto_tfm *tfm)
-{
- char drv_name[CRYPTO_MAX_ALG_NAME];
-
- snprintf(drv_name, sizeof(drv_name), "__driver-%s",
- crypto_tfm_alg_driver_name(tfm));
-
- return ablk_init_common(tfm, drv_name);
-}
-EXPORT_SYMBOL_GPL(ablk_init);
-
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
new file mode 100644
index 00000000000..522ab68d1c8
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -0,0 +1,2811 @@
+########################################################################
+# Copyright (c) 2013, Intel Corporation
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the
+# distribution.
+#
+# * Neither the name of the Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
+# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+##
+## Authors:
+## Erdinc Ozturk <erdinc.ozturk@intel.com>
+## Vinodh Gopal <vinodh.gopal@intel.com>
+## James Guilford <james.guilford@intel.com>
+## Tim Chen <tim.c.chen@linux.intel.com>
+##
+## References:
+## This code was derived and highly optimized from the code described in the paper:
+## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
+## on Intel Architecture Processors. August, 2010
+## The details of the implementation are explained in:
+## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
+## on Intel Architecture Processors. October, 2012.
+##
+## Assumptions:
+##
+##
+##
+## iv:
+## 0 1 2 3
+## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | Salt (From the SA) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | Initialization Vector |
+## | (This is the sequence number from IPSec header) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 0x1 |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+##
+##
+## AAD:
+## AAD padded to 128 bits with 0
+## for example, assume AAD is a u32 vector
+##
+## if AAD is 8 bytes:
+## AAD[3] = {A0, A1}#
+## padded AAD in xmm register = {A1 A0 0 0}
+##
+## 0 1 2 3
+## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | SPI (A1) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 32-bit Sequence Number (A0) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 0x0 |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+## AAD Format with 32-bit Sequence Number
+##
+## if AAD is 12 bytes:
+## AAD[3] = {A0, A1, A2}#
+## padded AAD in xmm register = {A2 A1 A0 0}
+##
+## 0 1 2 3
+## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | SPI (A2) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 64-bit Extended Sequence Number {A1,A0} |
+## | |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 0x0 |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+## AAD Format with 64-bit Extended Sequence Number
+##
+##
+## aadLen:
+## from the definition of the spec, aadLen can only be 8 or 12 bytes.
+##	The code additionally supports an aadLen of 16 bytes.
+##
+## TLen:
+## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+##
+## poly = x^128 + x^127 + x^126 + x^121 + 1
+## Throughout the code, one-tab and two-tab indentations are used: one tab
+## is for the GHASH part, two tabs are for the AES part.
+##
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+.data
+.align 16
+
+POLY: .octa 0xC2000000000000000000000000000001
+POLY2: .octa 0xC20000000000000000000001C2000000
+TWOONE: .octa 0x00000001000000000000000000000001
+
+# The order of these constants must not change.
+# More specifically, ALL_F must follow SHIFT_MASK, and ZERO must follow ALL_F.
+
+SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F: .octa 0xffffffffffffffffffffffffffffffff
+ZERO: .octa 0x00000000000000000000000000000000
+ONE: .octa 0x00000000000000000000000000000001
+ONEf: .octa 0x01000000000000000000000000000000
+
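+# (SHUF_MASK, used with vpshufb throughout, reverses the 16 bytes of an xmm
+# register to convert between the native little-endian layout and the
+# big-endian byte order GCM defines. ONE increments the counter while it is
+# held byte-swapped; ONEf increments it in native (output) byte order.)
+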
+.text
+
+
+##define the fields of the gcm aes context
+#{
+# u8 expanded_keys[16*11] store expanded keys
+# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
+# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
+# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
+# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
+# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
+# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
+# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
+# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
+# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+#} gcm_ctx#
+
+HashKey = 16*11 # store HashKey <<1 mod poly here
+HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
+HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
+HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
+HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
+HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
+HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
+HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
+HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
+HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+
+#define arg1 %rdi
+#define arg2 %rsi
+#define arg3 %rdx
+#define arg4 %rcx
+#define arg5 %r8
+#define arg6 %r9
+#define arg7 STACK_OFFSET+8*1(%r14)
+#define arg8 STACK_OFFSET+8*2(%r14)
+#define arg9 STACK_OFFSET+8*3(%r14)
+
+i = 0
+j = 0
+
+out_order = 0
+in_order = 1
+DEC = 0
+ENC = 1
+
+.macro define_reg r n
+reg_\r = %xmm\n
+.endm
+
+.macro setreg
+.altmacro
+define_reg i %i
+define_reg j %j
+.noaltmacro
+.endm
+
+# four registers are pushed onto the stack before %rsp is saved in %r14, so
+# the stack arguments (arg7-arg9) live above %r14 at STACK_OFFSET plus the
+# return address
+STACK_OFFSET = 8*4
+
+TMP1 = 16*0 # Temporary storage for AAD
+TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+TMP3 = 16*2 # Temporary storage for AES State 3
+TMP4 = 16*3 # Temporary storage for AES State 4
+TMP5 = 16*4 # Temporary storage for AES State 5
+TMP6 = 16*5 # Temporary storage for AES State 6
+TMP7 = 16*6 # Temporary storage for AES State 7
+TMP8 = 16*7 # Temporary storage for AES State 8
+
+VARIABLE_OFFSET = 16*8
+
+################################
+# Utility Macros
+################################
+
+# Encryption of a single block
+.macro ENCRYPT_SINGLE_BLOCK XMM0
+ vpxor (arg1), \XMM0, \XMM0
+ i = 1
+ setreg
+.rep 9
+ vaesenc 16*i(arg1), \XMM0, \XMM0
+ i = (i+1)
+ setreg
+.endr
+ vaesenclast 16*10(arg1), \XMM0, \XMM0
+.endm
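+
+# (The macro above is the standard 10-round AES-128 flow with the expanded
+# key schedule at arg1. A C-level sketch, where rk[0..10] are the 16-byte
+# round keys and aes_round()/aes_last_round() are hypothetical stand-ins
+# for the AESENC/AESENCLAST instruction semantics:
+#
+#	state ^= rk[0];				/* whitening: vpxor   */
+#	for (i = 1; i <= 9; i++)
+#		state = aes_round(state, rk[i]);	/* vaesenc     */
+#	state = aes_last_round(state, rk[10]);		/* vaesenclast */
+# )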
+
+#ifdef CONFIG_AS_AVX
+###############################################################################
+# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+# Input: A and B (128-bits each, bit-reflected)
+# Output: C = A*B*x mod poly, (i.e. >>1 )
+# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+###############################################################################
+.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
+
+ vpshufd $0b01001110, \GH, \T2
+ vpshufd $0b01001110, \HK, \T3
+ vpxor \GH , \T2, \T2 # T2 = (a1+a0)
+ vpxor \HK , \T3, \T3 # T3 = (b1+b0)
+
+ vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
+ vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
+ vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
+ vpxor \GH, \T2,\T2
+ vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
+
+ vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
+ vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
+ vpxor \T3, \GH, \GH
+ vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
+
+ #first phase of the reduction
+ vpslld $31, \GH, \T2 # packed right shifting << 31
+        vpslld  $30, \GH, \T3                   # packed right shifting << 30
+        vpslld  $25, \GH, \T4                   # packed right shifting << 25
+
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
+
+ vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
+ vpxor \T2, \GH, \GH # first phase of the reduction complete
+
+ #second phase of the reduction
+
+ vpsrld $1,\GH, \T2 # packed left shifting >> 1
+ vpsrld $2,\GH, \T3 # packed left shifting >> 2
+ vpsrld $7,\GH, \T4 # packed left shifting >> 7
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpxor \T5, \T2, \T2
+ vpxor \T2, \GH, \GH
+ vpxor \T1, \GH, \GH # the result is in GH
+
+
+.endm
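+
+# (The multiply above is one-level Karatsuba: with A = a1:a0 and B = b1:b0
+# split into 64-bit halves, three PCLMULQDQ issues suffice because, over
+# GF(2) where addition is XOR,
+#
+#	A*B = a1*b1*x^128
+#	      ^ (((a1^a0)*(b1^b0)) ^ a1*b1 ^ a0*b0)*x^64
+#	      ^ a0*b0
+#
+# The 256-bit product is then folded back to 128 bits by the two-phase
+# shift/xor reduction modulo the bit-reflected GHASH polynomial.)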
+
+.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
+
+	# HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
+ vmovdqa \HK, \T5
+
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
+ vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_2_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
+ vmovdqa \T5, HashKey_3(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_3_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
+ vmovdqa \T5, HashKey_4(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_4_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
+ vmovdqa \T5, HashKey_5(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_5_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
+ vmovdqa \T5, HashKey_6(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_6_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
+ vmovdqa \T5, HashKey_7(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_7_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
+ vmovdqa \T5, HashKey_8(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_8_k(arg1)
+
+.endm
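+
+# (A C-level sketch of the precomputation, with ghash_mul()/lo64()/hi64()
+# as illustrative names rather than kernel APIs:
+#
+#	h[1] = hashkey;				/* HashKey<<1 mod poly	  */
+#	for (i = 2; i <= 8; i++)
+#		h[i] = ghash_mul(h[i-1], h[1]);	/* HashKey^i<<1 mod poly  */
+#	for (i = 1; i <= 8; i++)
+#		h_k[i] = hi64(h[i]) ^ lo64(h[i]); /* Karatsuba middle term */
+# )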
+
+## if a = number of total plaintext bytes
+## b = floor(a/16)
+##	num_initial_blocks = b mod 8
+## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+## r10, r11, r12, rax are clobbered
+## arg1, arg2, arg3, r14 are used as pointers only, not modified
+
+.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
+ i = (8-\num_initial_blocks)
+ setreg
+
+ mov arg6, %r10 # r10 = AAD
+ mov arg7, %r12 # r12 = aadLen
+
+
+ mov %r12, %r11
+
+ vpxor reg_i, reg_i, reg_i
+_get_AAD_loop\@:
+ vmovd (%r10), \T1
+ vpslldq $12, \T1, \T1
+ vpsrldq $4, reg_i, reg_i
+ vpxor \T1, reg_i, reg_i
+
+ add $4, %r10
+ sub $4, %r12
+ jg _get_AAD_loop\@
+
+
+ cmp $16, %r11
+ je _get_AAD_loop2_done\@
+ mov $16, %r12
+
+_get_AAD_loop2\@:
+ vpsrldq $4, reg_i, reg_i
+ sub $4, %r12
+ cmp %r11, %r12
+ jg _get_AAD_loop2\@
+
+_get_AAD_loop2_done\@:
+
+ #byte-reflect the AAD data
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i
+
+ # initialize the data pointer offset as zero
+ xor %r11, %r11
+
+ # start AES for num_initial_blocks blocks
+ mov arg5, %rax # rax = *Y0
+ vmovdqu (%rax), \CTR # CTR = Y0
+ vpshufb SHUF_MASK(%rip), \CTR, \CTR
+
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, reg_i
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
+ i = (i+1)
+ setreg
+.endr
+
+ vmovdqa (arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpxor \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = 1
+ setreg
+.rep 9
+ vmovdqa 16*j(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenc \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = (j+1)
+ setreg
+.endr
+
+
+ vmovdqa 16*10(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenclast \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vmovdqu (arg3, %r11), \T1
+ vpxor \T1, reg_i, reg_i
+ vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
+ add $16, %r11
+.if \ENC_DEC == DEC
+ vmovdqa \T1, reg_i
+.endif
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
+ i = (i+1)
+ setreg
+.endr
+
+
+ i = (8-\num_initial_blocks)
+ j = (9-\num_initial_blocks)
+ setreg
+ GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
+
+.rep \num_initial_blocks
+ vpxor reg_i, reg_j, reg_j
+ GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ # XMM8 has the combined result here
+
+ vmovdqa \XMM8, TMP1(%rsp)
+ vmovdqa \XMM8, \T3
+
+ cmp $128, %r13
+ jl _initial_blocks_done\@ # no need for precomputed constants
+
+###############################################################################
+# HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM1
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM2
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM3
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM4
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM5
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM6
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM7
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM8
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+ vmovdqa (arg1), \T_key
+ vpxor \T_key, \XMM1, \XMM1
+ vpxor \T_key, \XMM2, \XMM2
+ vpxor \T_key, \XMM3, \XMM3
+ vpxor \T_key, \XMM4, \XMM4
+ vpxor \T_key, \XMM5, \XMM5
+ vpxor \T_key, \XMM6, \XMM6
+ vpxor \T_key, \XMM7, \XMM7
+ vpxor \T_key, \XMM8, \XMM8
+
+ i = 1
+ setreg
+.rep 9 # do 9 rounds
+ vmovdqa 16*i(arg1), \T_key
+ vaesenc \T_key, \XMM1, \XMM1
+ vaesenc \T_key, \XMM2, \XMM2
+ vaesenc \T_key, \XMM3, \XMM3
+ vaesenc \T_key, \XMM4, \XMM4
+ vaesenc \T_key, \XMM5, \XMM5
+ vaesenc \T_key, \XMM6, \XMM6
+ vaesenc \T_key, \XMM7, \XMM7
+ vaesenc \T_key, \XMM8, \XMM8
+ i = (i+1)
+ setreg
+.endr
+
+
+ vmovdqa 16*i(arg1), \T_key
+ vaesenclast \T_key, \XMM1, \XMM1
+ vaesenclast \T_key, \XMM2, \XMM2
+ vaesenclast \T_key, \XMM3, \XMM3
+ vaesenclast \T_key, \XMM4, \XMM4
+ vaesenclast \T_key, \XMM5, \XMM5
+ vaesenclast \T_key, \XMM6, \XMM6
+ vaesenclast \T_key, \XMM7, \XMM7
+ vaesenclast \T_key, \XMM8, \XMM8
+
+ vmovdqu (arg3, %r11), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vmovdqu \XMM1, (arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM1
+ .endif
+
+ vmovdqu 16*1(arg3, %r11), \T1
+ vpxor \T1, \XMM2, \XMM2
+ vmovdqu \XMM2, 16*1(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM2
+ .endif
+
+ vmovdqu 16*2(arg3, %r11), \T1
+ vpxor \T1, \XMM3, \XMM3
+ vmovdqu \XMM3, 16*2(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM3
+ .endif
+
+ vmovdqu 16*3(arg3, %r11), \T1
+ vpxor \T1, \XMM4, \XMM4
+ vmovdqu \XMM4, 16*3(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM4
+ .endif
+
+ vmovdqu 16*4(arg3, %r11), \T1
+ vpxor \T1, \XMM5, \XMM5
+ vmovdqu \XMM5, 16*4(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM5
+ .endif
+
+ vmovdqu 16*5(arg3, %r11), \T1
+ vpxor \T1, \XMM6, \XMM6
+ vmovdqu \XMM6, 16*5(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM6
+ .endif
+
+ vmovdqu 16*6(arg3, %r11), \T1
+ vpxor \T1, \XMM7, \XMM7
+ vmovdqu \XMM7, 16*6(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM7
+ .endif
+
+ vmovdqu 16*7(arg3, %r11), \T1
+ vpxor \T1, \XMM8, \XMM8
+ vmovdqu \XMM8, 16*7(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM8
+ .endif
+
+ add $128, %r11
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+###############################################################################
+
+_initial_blocks_done\@:
+
+.endm
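+
+# (In C terms, the dispatch in GCM_ENC_DEC_AVX below selects
+#
+#	num_initial_blocks = (plaintext_len / 16) % 8;
+#
+# so that, once the initial blocks are consumed, the remaining full blocks
+# are a multiple of eight and can run through the 8-wide main loop.)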
+
+# encrypt 8 blocks at a time
+# ghash the 8 previously encrypted ciphertext blocks
+# arg1, arg2, arg3 are used as pointers only, not modified
+# r11 is the data offset value
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+
+ vmovdqa \XMM1, \T2
+ vmovdqa \XMM2, TMP2(%rsp)
+ vmovdqa \XMM3, TMP3(%rsp)
+ vmovdqa \XMM4, TMP4(%rsp)
+ vmovdqa \XMM5, TMP5(%rsp)
+ vmovdqa \XMM6, TMP6(%rsp)
+ vmovdqa \XMM7, TMP7(%rsp)
+ vmovdqa \XMM8, TMP8(%rsp)
+
+.if \loop_idx == in_order
+ vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONE(%rip), \XMM1, \XMM2
+ vpaddd ONE(%rip), \XMM2, \XMM3
+ vpaddd ONE(%rip), \XMM3, \XMM4
+ vpaddd ONE(%rip), \XMM4, \XMM5
+ vpaddd ONE(%rip), \XMM5, \XMM6
+ vpaddd ONE(%rip), \XMM6, \XMM7
+ vpaddd ONE(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+.else
+ vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONEf(%rip), \XMM1, \XMM2
+ vpaddd ONEf(%rip), \XMM2, \XMM3
+ vpaddd ONEf(%rip), \XMM3, \XMM4
+ vpaddd ONEf(%rip), \XMM4, \XMM5
+ vpaddd ONEf(%rip), \XMM5, \XMM6
+ vpaddd ONEf(%rip), \XMM6, \XMM7
+ vpaddd ONEf(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+.endif
+
+
+ #######################################################################
+
+ vmovdqu (arg1), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vpxor \T1, \XMM2, \XMM2
+ vpxor \T1, \XMM3, \XMM3
+ vpxor \T1, \XMM4, \XMM4
+ vpxor \T1, \XMM5, \XMM5
+ vpxor \T1, \XMM6, \XMM6
+ vpxor \T1, \XMM7, \XMM7
+ vpxor \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+
+
+
+
+ vmovdqu 16*1(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqu 16*2(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ #######################################################################
+
+ vmovdqa HashKey_8(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
+ vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
+
+ vpshufd $0b01001110, \T2, \T6
+ vpxor \T2, \T6, \T6
+
+ vmovdqa HashKey_8_k(arg1), \T5
+ vpclmulqdq $0x00, \T5, \T6, \T6
+
+ vmovdqu 16*3(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP2(%rsp), \T1
+ vmovdqa HashKey_7(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_7_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*4(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+ vmovdqa TMP3(%rsp), \T1
+ vmovdqa HashKey_6(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_6_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*5(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP4(%rsp), \T1
+ vmovdqa HashKey_5(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_5_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*6(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ vmovdqa TMP5(%rsp), \T1
+ vmovdqa HashKey_4(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_4_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*7(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP6(%rsp), \T1
+ vmovdqa HashKey_3(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_3_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+
+ vmovdqu 16*8(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP7(%rsp), \T1
+ vmovdqa HashKey_2(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_2_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ #######################################################################
+
+ vmovdqu 16*9(arg1), \T5
+ vaesenc \T5, \XMM1, \XMM1
+ vaesenc \T5, \XMM2, \XMM2
+ vaesenc \T5, \XMM3, \XMM3
+ vaesenc \T5, \XMM4, \XMM4
+ vaesenc \T5, \XMM5, \XMM5
+ vaesenc \T5, \XMM6, \XMM6
+ vaesenc \T5, \XMM7, \XMM7
+ vaesenc \T5, \XMM8, \XMM8
+
+ vmovdqa TMP8(%rsp), \T1
+ vmovdqa HashKey(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vpxor \T4, \T6, \T6
+ vpxor \T7, \T6, \T6
+
+ vmovdqu 16*10(arg1), \T5
+
+ i = 0
+ j = 1
+ setreg
+.rep 8
+ vpxor 16*i(arg3, %r11), \T5, \T2
+ .if \ENC_DEC == ENC
+ vaesenclast \T2, reg_j, reg_j
+ .else
+ vaesenclast \T2, reg_j, \T3
+ vmovdqu 16*i(arg3, %r11), reg_j
+ vmovdqu \T3, 16*i(arg2, %r11)
+ .endif
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ #######################################################################
+
+
+ vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
+	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
+ vpxor \T3, \T7, \T7
+ vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
+
+
+
+ #######################################################################
+ #first phase of the reduction
+ #######################################################################
+ vpslld $31, \T7, \T2 # packed right shifting << 31
+	vpslld $30, \T7, \T3				# packed right shifting << 30
+	vpslld $25, \T7, \T4				# packed right shifting << 25
+
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
+
+ vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
+ vpxor \T2, \T7, \T7 # first phase of the reduction complete
+ #######################################################################
+ .if \ENC_DEC == ENC
+ vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
+ .endif
+
+ #######################################################################
+ #second phase of the reduction
+ vpsrld $1, \T7, \T2 # packed left shifting >> 1
+ vpsrld $2, \T7, \T3 # packed left shifting >> 2
+ vpsrld $7, \T7, \T4 # packed left shifting >> 7
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpxor \T1, \T2, \T2
+ vpxor \T2, \T7, \T7
+ vpxor \T7, \T6, \T6 # the result is in T6
+ #######################################################################
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+
+ vpxor \T6, \XMM1, \XMM1
+
+
+
+.endm
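+
+# (The loop body above is software-pipelined: while the AES rounds for the
+# current eight counter blocks are in flight, the GHASH partial products of
+# the previous eight ciphertext blocks, stashed in \T2 and TMP2-TMP8, are
+# accumulated. In C-like terms, with clmul_partial() a hypothetical
+# stand-in for the per-block PCLMULQDQ triple:
+#
+#	for (k = 0; k < 8; k++)
+#		acc ^= clmul_partial(prev_ct[k], h[8 - k]);
+#
+# Only one reduction is then performed for all eight blocks.)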
+
+
+# GHASH the last 8 ciphertext blocks.
+.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
+
+ ## Karatsuba Method
+
+
+ vpshufd $0b01001110, \XMM1, \T2
+ vpxor \XMM1, \T2, \T2
+ vmovdqa HashKey_8(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM1, \T6
+ vpclmulqdq $0x00, \T5, \XMM1, \T7
+
+ vmovdqa HashKey_8_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM2, \T2
+ vpxor \XMM2, \T2, \T2
+ vmovdqa HashKey_7(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM2, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM2, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_7_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM3, \T2
+ vpxor \XMM3, \T2, \T2
+ vmovdqa HashKey_6(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM3, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM3, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_6_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM4, \T2
+ vpxor \XMM4, \T2, \T2
+ vmovdqa HashKey_5(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM4, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM4, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_5_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM5, \T2
+ vpxor \XMM5, \T2, \T2
+ vmovdqa HashKey_4(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM5, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM5, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_4_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM6, \T2
+ vpxor \XMM6, \T2, \T2
+ vmovdqa HashKey_3(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM6, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM6, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_3_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM7, \T2
+ vpxor \XMM7, \T2, \T2
+ vmovdqa HashKey_2(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM7, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM7, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_2_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM8, \T2
+ vpxor \XMM8, \T2, \T2
+ vmovdqa HashKey(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM8, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM8, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+ vpxor \T6, \XMM1, \XMM1
+ vpxor \T7, \XMM1, \T2
+
+
+
+
+ vpslldq $8, \T2, \T4
+ vpsrldq $8, \T2, \T2
+
+ vpxor \T4, \T7, \T7
+ vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
+ # the accumulated carry-less multiplications
+
+ #######################################################################
+ #first phase of the reduction
+ vpslld $31, \T7, \T2 # packed right shifting << 31
+	vpslld $30, \T7, \T3				# packed right shifting << 30
+	vpslld $25, \T7, \T4				# packed right shifting << 25
+
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
+
+ vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
+ vpxor \T2, \T7, \T7 # first phase of the reduction complete
+ #######################################################################
+
+
+ #second phase of the reduction
+ vpsrld $1, \T7, \T2 # packed left shifting >> 1
+ vpsrld $2, \T7, \T3 # packed left shifting >> 2
+ vpsrld $7, \T7, \T4 # packed left shifting >> 7
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpxor \T1, \T2, \T2
+ vpxor \T2, \T7, \T7
+ vpxor \T7, \T6, \T6 # the result is in T6
+
+.endm
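+
+# (As in the main loop, the eight Karatsuba partial products are summed
+# first and the two-phase reduction is applied once at the end; deferring
+# the reduction to one per eight blocks is a major source of the 8-wide
+# speedup.)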
+
+
+# combined macro for the GCM encrypt and decrypt functions
+# clobbers all xmm registers
+# clobbers r10, r11, r12, r13, r14, r15
+.macro GCM_ENC_DEC_AVX ENC_DEC
+
+	# the number of pushes below must equal STACK_OFFSET/8
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rsp, %r14
+
+
+
+
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp # align rsp to 64 bytes
+
+
+ vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
+
+ mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
+ and $-16, %r13 # r13 = r13 - (r13 mod 16)
+
+ mov %r13, %r12
+ shr $4, %r12
+ and $7, %r12
+ jz _initial_num_blocks_is_0\@
+
+ cmp $7, %r12
+ je _initial_num_blocks_is_7\@
+ cmp $6, %r12
+ je _initial_num_blocks_is_6\@
+ cmp $5, %r12
+ je _initial_num_blocks_is_5\@
+ cmp $4, %r12
+ je _initial_num_blocks_is_4\@
+ cmp $3, %r12
+ je _initial_num_blocks_is_3\@
+ cmp $2, %r12
+ je _initial_num_blocks_is_2\@
+
+ jmp _initial_num_blocks_is_1\@
+
+_initial_num_blocks_is_7\@:
+ INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*7, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_6\@:
+ INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*6, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_5\@:
+ INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*5, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_4\@:
+ INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*4, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_3\@:
+ INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*3, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_2\@:
+ INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*2, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_1\@:
+ INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*1, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_0\@:
+ INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+
+
+_initial_blocks_encrypted\@:
+ cmp $0, %r13
+ je _zero_cipher_left\@
+
+ sub $128, %r13
+ je _eight_cipher_left\@
+
+
+
+
+ vmovd %xmm9, %r15d
+ and $255, %r15d
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+_encrypt_by_8_new\@:
+ cmp $(255-8), %r15d
+ jg _encrypt_by_8\@
+
+
+
+ add $8, %r15b
+ GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ jmp _eight_cipher_left\@
+
+_encrypt_by_8\@:
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $8, %r15b
+ GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+
+
+_eight_cipher_left\@:
+ GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
+
+
+_zero_cipher_left\@:
+ cmp $16, arg4
+ jl _only_less_than_16\@
+
+ mov arg4, %r13
+ and $15, %r13 # r13 = (arg4 mod 16)
+
+ je _multiple_of_16_bytes\@
+
+	# handle the last <16 Byte block separately
+
+
+ vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
+
+ sub $16, %r11
+ add %r13, %r11
+ vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
+
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12 # adjust the shuffle mask pointer to be
+ # able to shift 16-r13 bytes (r13 is the
+ # number of bytes in plaintext mod 16)
+ vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
+ vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
+ jmp _final_ghash_mul\@
+
+_only_less_than_16\@:
+ # check for 0 length
+ mov arg4, %r13
+ and $15, %r13 # r13 = (arg4 mod 16)
+
+ je _multiple_of_16_bytes\@
+
+	# handle the last <16 Byte block separately
+
+
+ vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
+
+
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12 # adjust the shuffle mask pointer to be
+ # able to shift 16-r13 bytes (r13 is the
+ # number of bytes in plaintext mod 16)
+
+_get_last_16_byte_loop\@:
+ movb (arg3, %r11), %al
+ movb %al, TMP1 (%rsp , %r11)
+ add $1, %r11
+ cmp %r13, %r11
+ jne _get_last_16_byte_loop\@
+
+ vmovdqu TMP1(%rsp), %xmm1
+
+ sub $16, %r11
+
+_final_ghash_mul\@:
+ .if \ENC_DEC == DEC
+ vmovdqa %xmm1, %xmm2
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
+ # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm2, %xmm2
+ vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm14, %xmm14
+ #GHASH computation for the last <16 Byte block
+ GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ sub %r13, %r11
+ add $16, %r11
+ .else
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
+ # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ vpxor %xmm9, %xmm14, %xmm14
+ #GHASH computation for the last <16 Byte block
+ GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ sub %r13, %r11
+ add $16, %r11
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
+ .endif
+
+
+ #############################
+ # output r13 Bytes
+ vmovq %xmm9, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left\@
+
+ mov %rax, (arg2 , %r11)
+ add $8, %r11
+ vpsrldq $8, %xmm9, %xmm9
+ vmovq %xmm9, %rax
+ sub $8, %r13
+
+_less_than_8_bytes_left\@:
+ movb %al, (arg2 , %r11)
+ add $1, %r11
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left\@
+ #############################
+
+_multiple_of_16_bytes\@:
+ mov arg7, %r12 # r12 = aadLen (number of bytes)
+ shl $3, %r12 # convert into number of bits
+ vmovd %r12d, %xmm15 # len(A) in xmm15
+
+	shl	$3, arg4			# len(C) in bits (*8)
+ vmovq arg4, %xmm1
+ vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
+ vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
+
+ vpxor %xmm15, %xmm14, %xmm14
+ GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
+ vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
+
+ mov arg5, %rax # rax = *Y0
+ vmovdqu (%rax), %xmm9 # xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
+
+ vpxor %xmm14, %xmm9, %xmm9
+
+
+
+_return_T\@:
+ mov arg8, %r10 # r10 = authTag
+ mov arg9, %r11 # r11 = auth_tag_len
+
+ cmp $16, %r11
+ je _T_16\@
+
+ cmp $12, %r11
+ je _T_12\@
+
+_T_8\@:
+ vmovq %xmm9, %rax
+ mov %rax, (%r10)
+ jmp _return_T_done\@
+_T_12\@:
+ vmovq %xmm9, %rax
+ mov %rax, (%r10)
+ vpsrldq $8, %xmm9, %xmm9
+ vmovd %xmm9, %eax
+ mov %eax, 8(%r10)
+ jmp _return_T_done\@
+
+_T_16\@:
+ vmovdqu %xmm9, (%r10)
+
+_return_T_done\@:
+ mov %r14, %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+.endm
+
+
+#############################################################
+#void aesni_gcm_precomp_avx_gen2
+# (gcm_data *my_ctx_data,
+# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+#############################################################
+ENTRY(aesni_gcm_precomp_avx_gen2)
+	# the number of pushes below must equal STACK_OFFSET/8
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rsp, %r14
+
+
+
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp # align rsp to 64 bytes
+
+ vmovdqu (arg2), %xmm6 # xmm6 = HashKey
+
+ vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
+ ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
+ vmovdqa %xmm6, %xmm2
+ vpsllq $1, %xmm6, %xmm6
+ vpsrlq $63, %xmm2, %xmm2
+ vmovdqa %xmm2, %xmm1
+ vpslldq $8, %xmm2, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpor %xmm2, %xmm6, %xmm6
+ #reduction
+ vpshufd $0b00100100, %xmm1, %xmm2
+ vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
+ vpand POLY(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
+ #######################################################################
+ vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
+
+
+ PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+
+ mov %r14, %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ ret
+ENDPROC(aesni_gcm_precomp_avx_gen2)
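+
+# (The PRECOMPUTATION block above derives HashKey<<1 mod poly without
+# PCLMUL: shift the 128-bit key left by one, then conditionally xor in
+# POLY when the bit shifted out of position 127 was set. A C-style sketch,
+# with u128 as a hypothetical 128-bit integer type:
+#
+#	carry = h >> 127;
+#	h <<= 1;
+#	if (carry)
+#		h ^= POLY;	/* 0xC2000000000000000000000000000001 */
+# )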
+
+###############################################################################
+#void aesni_gcm_enc_avx_gen2(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+# const u8 *in, /* Plaintext input */
+# u64 plaintext_len, /* Length of data in Bytes for encryption. */
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+# u8 *auth_tag, /* Authenticated Tag output. */
+# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
+# Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_enc_avx_gen2)
+ GCM_ENC_DEC_AVX ENC
+ ret
+ENDPROC(aesni_gcm_enc_avx_gen2)
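+
+# (A C-side usage sketch matching the prototypes documented above; the
+# actual callers live in aesni-intel_glue.c and are selected at module
+# init, so treat the call sequence below as illustrative only:
+#
+#	aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+#	aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv,
+#			       aad, aad_len, auth_tag, auth_tag_len);
+# )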
+
+###############################################################################
+#void aesni_gcm_dec_avx_gen2(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+# const u8 *in, /* Ciphertext input */
+# u64 plaintext_len, /* Length of data in Bytes for encryption. */
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+# u8 *auth_tag, /* Authenticated Tag output. */
+# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
+# Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_dec_avx_gen2)
+ GCM_ENC_DEC_AVX DEC
+ ret
+ENDPROC(aesni_gcm_dec_avx_gen2)
+#endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX2
+###############################################################################
+# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+# Input: A and B (128-bits each, bit-reflected)
+# Output: C = A*B*x mod poly, (i.e. >>1 )
+# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+###############################################################################
+.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
+
+ vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
+ vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
+ vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
+ vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
+ vpxor \T3, \GH, \GH
+
+
+ vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
+ vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
+
+ vpxor \T3, \T1, \T1
+ vpxor \T2, \GH, \GH
+
+ #######################################################################
+ #first phase of the reduction
+ vmovdqa POLY2(%rip), \T3
+
+ vpclmulqdq $0x01, \GH, \T3, \T2
+ vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
+
+ vpxor \T2, \GH, \GH # first phase of the reduction complete
+ #######################################################################
+ #second phase of the reduction
+ vpclmulqdq $0x00, \GH, \T3, \T2
+ vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq $0x10, \GH, \T3, \GH
+ vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor \T2, \GH, \GH # second phase of the reduction complete
+ #######################################################################
+ vpxor \T1, \GH, \GH # the result is in GH
+
+
+.endm
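+
+# (Compared with GHASH_MUL_AVX, this variant drops the Karatsuba middle
+# term in favour of four straight PCLMULQDQ multiplies, and replaces the
+# shift/xor reduction with two PCLMULQDQ folds against the precomputed
+# POLY2 constant; a better trade-off on parts where PCLMULQDQ throughput
+# improved.)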
+
+.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
+
+	# HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
+ vmovdqa \HK, \T5
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
+ vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
+ vmovdqa \T5, HashKey_3(arg1)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
+ vmovdqa \T5, HashKey_4(arg1)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
+ vmovdqa \T5, HashKey_5(arg1)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
+ vmovdqa \T5, HashKey_6(arg1)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
+ vmovdqa \T5, HashKey_7(arg1)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
+ vmovdqa \T5, HashKey_8(arg1)
+
+.endm
+
+
+## if a = number of total plaintext bytes
+## b = floor(a/16)
+##	num_initial_blocks = b mod 8
+## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+## r10, r11, r12, rax are clobbered
+## arg1, arg2, arg3, r14 are used as pointers only, not modified
+
+.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
+ i = (8-\num_initial_blocks)
+ setreg
+
+ mov arg6, %r10 # r10 = AAD
+ mov arg7, %r12 # r12 = aadLen
+
+
+ mov %r12, %r11
+
+ vpxor reg_i, reg_i, reg_i
+_get_AAD_loop\@:
+ vmovd (%r10), \T1
+ vpslldq $12, \T1, \T1
+ vpsrldq $4, reg_i, reg_i
+ vpxor \T1, reg_i, reg_i
+
+ add $4, %r10
+ sub $4, %r12
+ jg _get_AAD_loop\@
+
+
+ cmp $16, %r11
+ je _get_AAD_loop2_done\@
+ mov $16, %r12
+
+_get_AAD_loop2\@:
+ vpsrldq $4, reg_i, reg_i
+ sub $4, %r12
+ cmp %r11, %r12
+ jg _get_AAD_loop2\@
+
+_get_AAD_loop2_done\@:
+
+ #byte-reflect the AAD data
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i
+
+ # initialize the data pointer offset as zero
+ xor %r11, %r11
+
+ # start AES for num_initial_blocks blocks
+ mov arg5, %rax # rax = *Y0
+ vmovdqu (%rax), \CTR # CTR = Y0
+ vpshufb SHUF_MASK(%rip), \CTR, \CTR
+
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, reg_i
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
+ i = (i+1)
+ setreg
+.endr
+
+ vmovdqa (arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpxor \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = 1
+ setreg
+.rep 9
+ vmovdqa 16*j(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenc \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = (j+1)
+ setreg
+.endr
+
+
+ vmovdqa 16*10(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenclast \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vmovdqu (arg3, %r11), \T1
+ vpxor \T1, reg_i, reg_i
+ vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
+ # num_initial_blocks blocks
+ add $16, %r11
+.if \ENC_DEC == DEC
+ vmovdqa \T1, reg_i
+.endif
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
+ i = (i+1)
+ setreg
+.endr
+
+
+ i = (8-\num_initial_blocks)
+ j = (9-\num_initial_blocks)
+ setreg
+ GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
+
+.rep \num_initial_blocks
+ vpxor reg_i, reg_j, reg_j
+ GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ # XMM8 has the combined result here
+
+ vmovdqa \XMM8, TMP1(%rsp)
+ vmovdqa \XMM8, \T3
+
+ cmp $128, %r13
+ jl _initial_blocks_done\@ # no need for precomputed constants
+
+###############################################################################
+# prepare 8 counter blocks and encrypt them in parallel, priming the 8-block main loop
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM1
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM2
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM3
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM4
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM5
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM6
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM7
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM8
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+ vmovdqa (arg1), \T_key
+ vpxor \T_key, \XMM1, \XMM1
+ vpxor \T_key, \XMM2, \XMM2
+ vpxor \T_key, \XMM3, \XMM3
+ vpxor \T_key, \XMM4, \XMM4
+ vpxor \T_key, \XMM5, \XMM5
+ vpxor \T_key, \XMM6, \XMM6
+ vpxor \T_key, \XMM7, \XMM7
+ vpxor \T_key, \XMM8, \XMM8
+
+ i = 1
+ setreg
+.rep 9 # do 9 rounds
+ vmovdqa 16*i(arg1), \T_key
+ vaesenc \T_key, \XMM1, \XMM1
+ vaesenc \T_key, \XMM2, \XMM2
+ vaesenc \T_key, \XMM3, \XMM3
+ vaesenc \T_key, \XMM4, \XMM4
+ vaesenc \T_key, \XMM5, \XMM5
+ vaesenc \T_key, \XMM6, \XMM6
+ vaesenc \T_key, \XMM7, \XMM7
+ vaesenc \T_key, \XMM8, \XMM8
+ i = (i+1)
+ setreg
+.endr
+
+
+ vmovdqa 16*i(arg1), \T_key
+ vaesenclast \T_key, \XMM1, \XMM1
+ vaesenclast \T_key, \XMM2, \XMM2
+ vaesenclast \T_key, \XMM3, \XMM3
+ vaesenclast \T_key, \XMM4, \XMM4
+ vaesenclast \T_key, \XMM5, \XMM5
+ vaesenclast \T_key, \XMM6, \XMM6
+ vaesenclast \T_key, \XMM7, \XMM7
+ vaesenclast \T_key, \XMM8, \XMM8
+
+ vmovdqu (arg3, %r11), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vmovdqu \XMM1, (arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM1
+ .endif
+
+ vmovdqu 16*1(arg3, %r11), \T1
+ vpxor \T1, \XMM2, \XMM2
+ vmovdqu \XMM2, 16*1(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM2
+ .endif
+
+ vmovdqu 16*2(arg3, %r11), \T1
+ vpxor \T1, \XMM3, \XMM3
+ vmovdqu \XMM3, 16*2(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM3
+ .endif
+
+ vmovdqu 16*3(arg3, %r11), \T1
+ vpxor \T1, \XMM4, \XMM4
+ vmovdqu \XMM4, 16*3(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM4
+ .endif
+
+ vmovdqu 16*4(arg3, %r11), \T1
+ vpxor \T1, \XMM5, \XMM5
+ vmovdqu \XMM5, 16*4(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM5
+ .endif
+
+ vmovdqu 16*5(arg3, %r11), \T1
+ vpxor \T1, \XMM6, \XMM6
+ vmovdqu \XMM6, 16*5(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM6
+ .endif
+
+ vmovdqu 16*6(arg3, %r11), \T1
+ vpxor \T1, \XMM7, \XMM7
+ vmovdqu \XMM7, 16*6(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM7
+ .endif
+
+ vmovdqu 16*7(arg3, %r11), \T1
+ vpxor \T1, \XMM8, \XMM8
+ vmovdqu \XMM8, 16*7(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM8
+ .endif
+
+ add $128, %r11
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
+ # the corresponding ciphertext
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+###############################################################################
+
+_initial_blocks_done\@:
+
+
+.endm
+
+
+
+# encrypt 8 blocks at a time
+# ghash the 8 previously encrypted ciphertext blocks
+# arg1, arg2, arg3 are used as pointers only, not modified
+# r11 is the data offset value
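+# The four vpclmulqdq per block (a1*b1, a0*b0 and the two cross terms)
+# are interleaved between the AES rounds to hide the latency of both
+# vaesenc and vpclmulqdq.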
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+
+ vmovdqa \XMM1, \T2
+ vmovdqa \XMM2, TMP2(%rsp)
+ vmovdqa \XMM3, TMP3(%rsp)
+ vmovdqa \XMM4, TMP4(%rsp)
+ vmovdqa \XMM5, TMP5(%rsp)
+ vmovdqa \XMM6, TMP6(%rsp)
+ vmovdqa \XMM7, TMP7(%rsp)
+ vmovdqa \XMM8, TMP8(%rsp)
+
+.if \loop_idx == in_order
+ vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONE(%rip), \XMM1, \XMM2
+ vpaddd ONE(%rip), \XMM2, \XMM3
+ vpaddd ONE(%rip), \XMM3, \XMM4
+ vpaddd ONE(%rip), \XMM4, \XMM5
+ vpaddd ONE(%rip), \XMM5, \XMM6
+ vpaddd ONE(%rip), \XMM6, \XMM7
+ vpaddd ONE(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+.else
+ vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONEf(%rip), \XMM1, \XMM2
+ vpaddd ONEf(%rip), \XMM2, \XMM3
+ vpaddd ONEf(%rip), \XMM3, \XMM4
+ vpaddd ONEf(%rip), \XMM4, \XMM5
+ vpaddd ONEf(%rip), \XMM5, \XMM6
+ vpaddd ONEf(%rip), \XMM6, \XMM7
+ vpaddd ONEf(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+.endif
+
+
+ #######################################################################
+
+ vmovdqu (arg1), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vpxor \T1, \XMM2, \XMM2
+ vpxor \T1, \XMM3, \XMM3
+ vpxor \T1, \XMM4, \XMM4
+ vpxor \T1, \XMM5, \XMM5
+ vpxor \T1, \XMM6, \XMM6
+ vpxor \T1, \XMM7, \XMM7
+ vpxor \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+
+
+
+
+ vmovdqu 16*1(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqu 16*2(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ #######################################################################
+
+ vmovdqa HashKey_8(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
+ vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
+ vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
+ vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
+ vpxor \T5, \T6, \T6
+
+ vmovdqu 16*3(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP2(%rsp), \T1
+ vmovdqa HashKey_7(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*4(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+ vmovdqa TMP3(%rsp), \T1
+ vmovdqa HashKey_6(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*5(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP4(%rsp), \T1
+ vmovdqa HashKey_5(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*6(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ vmovdqa TMP5(%rsp), \T1
+ vmovdqa HashKey_4(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*7(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP6(%rsp), \T1
+ vmovdqa HashKey_3(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*8(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP7(%rsp), \T1
+ vmovdqa HashKey_2(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+
+ #######################################################################
+
+ vmovdqu 16*9(arg1), \T5
+ vaesenc \T5, \XMM1, \XMM1
+ vaesenc \T5, \XMM2, \XMM2
+ vaesenc \T5, \XMM3, \XMM3
+ vaesenc \T5, \XMM4, \XMM4
+ vaesenc \T5, \XMM5, \XMM5
+ vaesenc \T5, \XMM6, \XMM6
+ vaesenc \T5, \XMM7, \XMM7
+ vaesenc \T5, \XMM8, \XMM8
+
+ vmovdqa TMP8(%rsp), \T1
+ vmovdqa HashKey(arg1), \T5
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T1
+
+
+ vmovdqu 16*10(arg1), \T5
+
+ i = 0
+ j = 1
+ setreg
+.rep 8
+ vpxor 16*i(arg3, %r11), \T5, \T2
+ .if \ENC_DEC == ENC
+ vaesenclast \T2, reg_j, reg_j
+ .else
+ vaesenclast \T2, reg_j, \T3
+ vmovdqu 16*i(arg3, %r11), reg_j
+ vmovdqu \T3, 16*i(arg2, %r11)
+ .endif
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ #######################################################################
+
+
+ vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
+	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
+ vpxor \T3, \T7, \T7
+ vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
+
+
+
+ #######################################################################
+ #first phase of the reduction
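+	# fold the upper 128 bits of the 256-bit product in T1:T7 back into
+	# GF(2^128); POLY2 holds the GCM polynomial in pclmul-friendly form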
+ vmovdqa POLY2(%rip), \T3
+
+ vpclmulqdq $0x01, \T7, \T3, \T2
+ vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
+
+ vpxor \T2, \T7, \T7 # first phase of the reduction complete
+ #######################################################################
+ .if \ENC_DEC == ENC
+ vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
+ .endif
+
+ #######################################################################
+ #second phase of the reduction
+ vpclmulqdq $0x00, \T7, \T3, \T2
+ vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq $0x10, \T7, \T3, \T4
+ vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor \T2, \T4, \T4 # second phase of the reduction complete
+ #######################################################################
+ vpxor \T4, \T1, \T1 # the result is in T1
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+
+ vpxor \T1, \XMM1, \XMM1
+
+
+
+.endm
+
+
+# GHASH the last 8 ciphertext blocks.
+.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
+
+ ## Karatsuba Method
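+	## (3 carry-less multiplies per block instead of 4: hi*hi, lo*lo
+	## and (hi^lo)*(hi^lo); the middle terms are accumulated in XMM1)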
+
+ vmovdqa HashKey_8(arg1), \T5
+
+ vpshufd $0b01001110, \XMM1, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM1, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM1, \T6
+ vpclmulqdq $0x00, \T5, \XMM1, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \XMM1
+
+ ######################
+
+ vmovdqa HashKey_7(arg1), \T5
+ vpshufd $0b01001110, \XMM2, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM2, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM2, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM2, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqa HashKey_6(arg1), \T5
+ vpshufd $0b01001110, \XMM3, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM3, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM3, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM3, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqa HashKey_5(arg1), \T5
+ vpshufd $0b01001110, \XMM4, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM4, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM4, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM4, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqa HashKey_4(arg1), \T5
+ vpshufd $0b01001110, \XMM5, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM5, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM5, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM5, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqa HashKey_3(arg1), \T5
+ vpshufd $0b01001110, \XMM6, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM6, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM6, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM6, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqa HashKey_2(arg1), \T5
+ vpshufd $0b01001110, \XMM7, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM7, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM7, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM7, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqa HashKey(arg1), \T5
+ vpshufd $0b01001110, \XMM8, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM8, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM8, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM8, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+ vpxor \T6, \XMM1, \XMM1
+ vpxor \T7, \XMM1, \T2
+
+
+
+
+ vpslldq $8, \T2, \T4
+ vpsrldq $8, \T2, \T2
+
+ vpxor \T4, \T7, \T7
+ vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
+ # accumulated carry-less multiplications
+
+ #######################################################################
+ #first phase of the reduction
+ vmovdqa POLY2(%rip), \T3
+
+ vpclmulqdq $0x01, \T7, \T3, \T2
+ vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
+
+ vpxor \T2, \T7, \T7 # first phase of the reduction complete
+ #######################################################################
+
+
+ #second phase of the reduction
+ vpclmulqdq $0x00, \T7, \T3, \T2
+ vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq $0x10, \T7, \T3, \T4
+ vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor \T2, \T4, \T4 # second phase of the reduction complete
+ #######################################################################
+ vpxor \T4, \T6, \T6 # the result is in T6
+.endm
+
+
+
+# combined macro for the GCM encrypt and decrypt functions
+# clobbering all xmm registers
+# clobbering r10, r11, r12, r13, r14, r15
+.macro GCM_ENC_DEC_AVX2 ENC_DEC
+
+ #the number of pushes must equal STACK_OFFSET
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rsp, %r14
+
+
+
+
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp # align rsp to 64 bytes
+
+
+ vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
+
+ mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
+ and $-16, %r13 # r13 = r13 - (r13 mod 16)
+
+ mov %r13, %r12
+ shr $4, %r12
+ and $7, %r12
+ jz _initial_num_blocks_is_0\@
+
+ cmp $7, %r12
+ je _initial_num_blocks_is_7\@
+ cmp $6, %r12
+ je _initial_num_blocks_is_6\@
+ cmp $5, %r12
+ je _initial_num_blocks_is_5\@
+ cmp $4, %r12
+ je _initial_num_blocks_is_4\@
+ cmp $3, %r12
+ je _initial_num_blocks_is_3\@
+ cmp $2, %r12
+ je _initial_num_blocks_is_2\@
+
+ jmp _initial_num_blocks_is_1\@
+
+_initial_num_blocks_is_7\@:
+ INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*7, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_6\@:
+ INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*6, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_5\@:
+ INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*5, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_4\@:
+ INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*4, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_3\@:
+ INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*3, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_2\@:
+ INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*2, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_1\@:
+ INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*1, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_0\@:
+ INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+
+
+_initial_blocks_encrypted\@:
+ cmp $0, %r13
+ je _zero_cipher_left\@
+
+ sub $128, %r13
+ je _eight_cipher_left\@
+
+
+
+
+ vmovd %xmm9, %r15d
+ and $255, %r15d
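+	# r15d tracks the counter's low byte; while adding 8 cannot wrap
+	# it, the counter can be incremented in byte-swapped form
+	# (out_order) without any carry crossing the low byte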
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+_encrypt_by_8_new\@:
+ cmp $(255-8), %r15d
+ jg _encrypt_by_8\@
+
+
+
+ add $8, %r15b
+ GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ jmp _eight_cipher_left\@
+
+_encrypt_by_8\@:
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $8, %r15b
+ GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+
+
+_eight_cipher_left\@:
+ GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
+
+
+_zero_cipher_left\@:
+ cmp $16, arg4
+ jl _only_less_than_16\@
+
+ mov arg4, %r13
+ and $15, %r13 # r13 = (arg4 mod 16)
+
+ je _multiple_of_16_bytes\@
+
+	# handle the last <16 Byte block separately
+
+
+ vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
+
+ sub $16, %r11
+ add %r13, %r11
+ vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
+
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12 # adjust the shuffle mask pointer
+ # to be able to shift 16-r13 bytes
+ # (r13 is the number of bytes in plaintext mod 16)
+ vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
+ vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
+ jmp _final_ghash_mul\@
+
+_only_less_than_16\@:
+ # check for 0 length
+ mov arg4, %r13
+ and $15, %r13 # r13 = (arg4 mod 16)
+
+ je _multiple_of_16_bytes\@
+
+	# handle the last <16 Byte block separately
+
+
+ vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
+
+
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12 # adjust the shuffle mask pointer to be
+ # able to shift 16-r13 bytes (r13 is the
+ # number of bytes in plaintext mod 16)
+
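+	# copy the <16-byte input to the stack byte by byte so the 16-byte
+	# load below cannot read past the end of the input buffer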
+_get_last_16_byte_loop\@:
+ movb (arg3, %r11), %al
+	movb %al, TMP1(%rsp, %r11)
+ add $1, %r11
+ cmp %r13, %r11
+ jne _get_last_16_byte_loop\@
+
+ vmovdqu TMP1(%rsp), %xmm1
+
+ sub $16, %r11
+
+_final_ghash_mul\@:
+ .if \ENC_DEC == DEC
+ vmovdqa %xmm1, %xmm2
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm2, %xmm2
+ vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm14, %xmm14
+ #GHASH computation for the last <16 Byte block
+ GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ sub %r13, %r11
+ add $16, %r11
+ .else
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ vpxor %xmm9, %xmm14, %xmm14
+ #GHASH computation for the last <16 Byte block
+ GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ sub %r13, %r11
+ add $16, %r11
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
+ .endif
+
+
+ #############################
+ # output r13 Bytes
+ vmovq %xmm9, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left\@
+
+ mov %rax, (arg2 , %r11)
+ add $8, %r11
+ vpsrldq $8, %xmm9, %xmm9
+ vmovq %xmm9, %rax
+ sub $8, %r13
+
+_less_than_8_bytes_left\@:
+ movb %al, (arg2 , %r11)
+ add $1, %r11
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left\@
+ #############################
+
+_multiple_of_16_bytes\@:
+ mov arg7, %r12 # r12 = aadLen (number of bytes)
+ shl $3, %r12 # convert into number of bits
+ vmovd %r12d, %xmm15 # len(A) in xmm15
+
+	shl	$3, arg4				# len(C) in bits (*8)
+ vmovq arg4, %xmm1
+ vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
+ vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
+
+ vpxor %xmm15, %xmm14, %xmm14
+ GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
+ vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
+
+ mov arg5, %rax # rax = *Y0
+ vmovdqu (%rax), %xmm9 # xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
+
+ vpxor %xmm14, %xmm9, %xmm9
+
+
+
+_return_T\@:
+ mov arg8, %r10 # r10 = authTag
+ mov arg9, %r11 # r11 = auth_tag_len
+
+ cmp $16, %r11
+ je _T_16\@
+
+ cmp $12, %r11
+ je _T_12\@
+
+_T_8\@:
+ vmovq %xmm9, %rax
+ mov %rax, (%r10)
+ jmp _return_T_done\@
+_T_12\@:
+ vmovq %xmm9, %rax
+ mov %rax, (%r10)
+ vpsrldq $8, %xmm9, %xmm9
+ vmovd %xmm9, %eax
+ mov %eax, 8(%r10)
+ jmp _return_T_done\@
+
+_T_16\@:
+ vmovdqu %xmm9, (%r10)
+
+_return_T_done\@:
+ mov %r14, %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+.endm
+
+
+#############################################################
+#void aesni_gcm_precomp_avx_gen4
+# (gcm_data *my_ctx_data,
+# u8 *hash_subkey)# /* H, the Hash sub key input.
+# Data starts on a 16-byte boundary. */
+#############################################################
+ENTRY(aesni_gcm_precomp_avx_gen4)
+ #the number of pushes must equal STACK_OFFSET
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rsp, %r14
+
+
+
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp # align rsp to 64 bytes
+
+ vmovdqu (arg2), %xmm6 # xmm6 = HashKey
+
+ vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
+ ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
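+	# double H in GF(2^128): shift each quadword left by one, carry the
+	# top bits across, then conditionally xor in POLY if the MSB was set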
+ vmovdqa %xmm6, %xmm2
+ vpsllq $1, %xmm6, %xmm6
+ vpsrlq $63, %xmm2, %xmm2
+ vmovdqa %xmm2, %xmm1
+ vpslldq $8, %xmm2, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpor %xmm2, %xmm6, %xmm6
+ #reduction
+ vpshufd $0b00100100, %xmm1, %xmm2
+ vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
+ vpand POLY(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
+ #######################################################################
+ vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
+
+
+ PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+
+ mov %r14, %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ ret
+ENDPROC(aesni_gcm_precomp_avx_gen4)
+
+
+###############################################################################
+#void aesni_gcm_enc_avx_gen4(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+# const u8 *in, /* Plaintext input */
+# u64 plaintext_len, /* Length of data in Bytes for encryption. */
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+# u8 *auth_tag, /* Authenticated Tag output. */
+# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
+# Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_enc_avx_gen4)
+ GCM_ENC_DEC_AVX2 ENC
+ ret
+ENDPROC(aesni_gcm_enc_avx_gen4)
+
+###############################################################################
+#void aesni_gcm_dec_avx_gen4(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+# const u8 *in, /* Ciphertext input */
+#	u64	plaintext_len, /* Length of data in Bytes for decryption. */
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+# u8 *auth_tag, /* Authenticated Tag output. */
+# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
+# Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_dec_avx_gen4)
+ GCM_ENC_DEC_AVX2 DEC
+ ret
+ENDPROC(aesni_gcm_dec_avx_gen4)
+
+#endif /* CONFIG_AS_AVX2 */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index f80e668785c..948ad0e7774 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -34,7 +34,7 @@
#include <asm/cpu_device_id.h>
#include <asm/i387.h>
#include <asm/crypto/aes.h>
-#include <asm/crypto/ablk_helper.h>
+#include <crypto/ablk_helper.h>
#include <crypto/scatterwalk.h>
#include <crypto/internal/aead.h>
#include <linux/workqueue.h>
@@ -101,6 +101,9 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
int crypto_fpu_init(void);
void crypto_fpu_exit(void);
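+/* message-length thresholds used to pick a GCM implementation below */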
+#define AVX_GEN2_OPTSIZE 640
+#define AVX_GEN4_OPTSIZE 4096
+
#ifdef CONFIG_X86_64
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
@@ -150,6 +153,123 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
+
+#ifdef CONFIG_AS_AVX
+/*
+ * asmlinkage void aesni_gcm_precomp_avx_gen2()
+ * gcm_data *my_ctx_data, context data
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ */
+asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
+
+asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+static void aesni_gcm_enc_avx(void *ctx, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len)
+{
+ if (plaintext_len < AVX_GEN2_OPTSIZE) {
+ aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
+ aad_len, auth_tag, auth_tag_len);
+ } else {
+ aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+ aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
+ aad_len, auth_tag, auth_tag_len);
+ }
+}
+
+static void aesni_gcm_dec_avx(void *ctx, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len)
+{
+ if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+ aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad,
+ aad_len, auth_tag, auth_tag_len);
+ } else {
+ aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+ aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
+ aad_len, auth_tag, auth_tag_len);
+ }
+}
+#endif
+
+#ifdef CONFIG_AS_AVX2
+/*
+ * asmlinkage void aesni_gcm_precomp_avx_gen4()
+ * gcm_data *my_ctx_data, context data
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ */
+asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
+
+asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len)
+{
+ if (plaintext_len < AVX_GEN2_OPTSIZE) {
+ aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
+ aad_len, auth_tag, auth_tag_len);
+ } else if (plaintext_len < AVX_GEN4_OPTSIZE) {
+ aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+ aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
+ aad_len, auth_tag, auth_tag_len);
+ } else {
+ aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
+ aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad,
+ aad_len, auth_tag, auth_tag_len);
+ }
+}
+
+static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len)
+{
+ if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+ aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey,
+ aad, aad_len, auth_tag, auth_tag_len);
+ } else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
+ aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+ aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
+ aad_len, auth_tag, auth_tag_len);
+ } else {
+ aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
+ aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad,
+ aad_len, auth_tag, auth_tag_len);
+ }
+}
+#endif
+
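+/*
+ * Run-time dispatch: aesni_init() points these at the fastest GCM
+ * implementation the CPU supports (SSE, AVX, or AVX2).
+ */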
+static void (*aesni_gcm_enc_tfm)(void *ctx, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
static inline struct
aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
{
@@ -915,7 +1035,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
dst = src;
}
- aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
+ aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
+ ((unsigned long)req->cryptlen), auth_tag_len);
@@ -996,12 +1116,12 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
dst = src;
}
- aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
+ aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv,
ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
authTag, auth_tag_len);
/* Compare generated tag with passed in tag. */
- retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
+ retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ?
-EBADMSG : 0;
if (one_entry_in_sg) {
@@ -1353,6 +1473,27 @@ static int __init aesni_init(void)
if (!x86_match_cpu(aesni_cpu_id))
return -ENODEV;
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_AS_AVX2
+ if (boot_cpu_has(X86_FEATURE_AVX2)) {
+ pr_info("AVX2 version of gcm_enc/dec engaged.\n");
+ aesni_gcm_enc_tfm = aesni_gcm_enc_avx2;
+ aesni_gcm_dec_tfm = aesni_gcm_dec_avx2;
+ } else
+#endif
+#ifdef CONFIG_AS_AVX
+ if (boot_cpu_has(X86_FEATURE_AVX)) {
+ pr_info("AVX version of gcm_enc/dec engaged.\n");
+ aesni_gcm_enc_tfm = aesni_gcm_enc_avx;
+ aesni_gcm_dec_tfm = aesni_gcm_dec_avx;
+ } else
+#endif
+ {
+ pr_info("SSE version of gcm_enc/dec engaged.\n");
+ aesni_gcm_enc_tfm = aesni_gcm_enc;
+ aesni_gcm_dec_tfm = aesni_gcm_dec;
+ }
+#endif
err = crypto_fpu_init();
if (err)
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 50ec333b70e..8af519ed73d 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -223,9 +223,6 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
src -= 1;
dst -= 1;
} while (nbytes >= bsize * 4);
-
- if (nbytes < bsize)
- goto done;
}
/* Handle leftovers */
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index 414fe5d7946..4209a76fcda 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <crypto/ablk_helper.h>
#include <crypto/algapi.h>
#include <crypto/ctr.h>
#include <crypto/lrw.h>
@@ -21,7 +22,6 @@
#include <asm/xcr.h>
#include <asm/xsave.h>
#include <asm/crypto/camellia.h>
-#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>
#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index 37fd0c0a81e..87a041a10f4 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <crypto/ablk_helper.h>
#include <crypto/algapi.h>
#include <crypto/ctr.h>
#include <crypto/lrw.h>
@@ -21,7 +22,6 @@
#include <asm/xcr.h>
#include <asm/xsave.h>
#include <asm/crypto/camellia.h>
-#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>
#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index c6631813dc1..e57e20ab5e0 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -26,13 +26,13 @@
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <crypto/ablk_helper.h>
#include <crypto/algapi.h>
#include <crypto/cast5.h>
#include <crypto/cryptd.h>
#include <crypto/ctr.h>
#include <asm/xcr.h>
#include <asm/xsave.h>
-#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>
#define CAST5_PARALLEL_BLOCKS 16
@@ -203,9 +203,6 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
src -= 1;
dst -= 1;
} while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
-
- if (nbytes < bsize)
- goto done;
}
/* Handle leftovers */
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 8d0dfb86a55..09f3677393e 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -28,6 +28,7 @@
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <crypto/ablk_helper.h>
#include <crypto/algapi.h>
#include <crypto/cast6.h>
#include <crypto/cryptd.h>
@@ -37,7 +38,6 @@
#include <crypto/xts.h>
#include <asm/xcr.h>
#include <asm/xsave.h>
-#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>
#define CAST6_PARALLEL_BLOCKS 8
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 586f41aac36..5d1e0075ac2 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -24,10 +24,6 @@
.align 16
.Lbswap_mask:
.octa 0x000102030405060708090a0b0c0d0e0f
-.Lpoly:
- .octa 0xc2000000000000000000000000000001
-.Ltwo_one:
- .octa 0x00000001000000000000000000000001
#define DATA %xmm0
#define SHASH %xmm1
@@ -96,7 +92,7 @@ __clmul_gf128mul_ble:
ret
ENDPROC(__clmul_gf128mul_ble)
-/* void clmul_ghash_mul(char *dst, const be128 *shash) */
+/* void clmul_ghash_mul(char *dst, const u128 *shash) */
ENTRY(clmul_ghash_mul)
movups (%rdi), DATA
movups (%rsi), SHASH
@@ -110,7 +106,7 @@ ENDPROC(clmul_ghash_mul)
/*
* void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- * const be128 *shash);
+ * const u128 *shash);
*/
ENTRY(clmul_ghash_update)
cmp $16, %rdx
@@ -134,28 +130,3 @@ ENTRY(clmul_ghash_update)
.Lupdate_just_ret:
ret
ENDPROC(clmul_ghash_update)
-
-/*
- * void clmul_ghash_setkey(be128 *shash, const u8 *key);
- *
- * Calculate hash_key << 1 mod poly
- */
-ENTRY(clmul_ghash_setkey)
- movaps .Lbswap_mask, BSWAP
- movups (%rsi), %xmm0
- PSHUFB_XMM BSWAP %xmm0
- movaps %xmm0, %xmm1
- psllq $1, %xmm0
- psrlq $63, %xmm1
- movaps %xmm1, %xmm2
- pslldq $8, %xmm1
- psrldq $8, %xmm2
- por %xmm1, %xmm0
- # reduction
- pshufd $0b00100100, %xmm2, %xmm1
- pcmpeqd .Ltwo_one, %xmm1
- pand .Lpoly, %xmm1
- pxor %xmm1, %xmm0
- movups %xmm0, (%rdi)
- ret
-ENDPROC(clmul_ghash_setkey)
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 6759dd1135b..88bb7ba8b17 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -25,19 +25,17 @@
#define GHASH_BLOCK_SIZE 16
#define GHASH_DIGEST_SIZE 16
-void clmul_ghash_mul(char *dst, const be128 *shash);
+void clmul_ghash_mul(char *dst, const u128 *shash);
void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- const be128 *shash);
-
-void clmul_ghash_setkey(be128 *shash, const u8 *key);
+ const u128 *shash);
struct ghash_async_ctx {
struct cryptd_ahash *cryptd_tfm;
};
struct ghash_ctx {
- be128 shash;
+ u128 shash;
};
struct ghash_desc_ctx {
@@ -58,13 +56,23 @@ static int ghash_setkey(struct crypto_shash *tfm,
const u8 *key, unsigned int keylen)
{
struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
+ be128 *x = (be128 *)key;
+ u64 a, b;
if (keylen != GHASH_BLOCK_SIZE) {
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
- clmul_ghash_setkey(&ctx->shash, key);
+ /* perform multiplication by 'x' in GF(2^128) */
+ a = be64_to_cpu(x->a);
+ b = be64_to_cpu(x->b);
+
+ ctx->shash.a = (b << 1) | (a >> 63);
+ ctx->shash.b = (a << 1) | (b >> 63);
+
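+	/* if the top bit shifted out, reduce by xoring in the GCM polynomial */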
+ if (a >> 63)
+ ctx->shash.b ^= ((u64)0xc2) << 56;
return 0;
}
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 23aabc6c20a..2fae489b152 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <crypto/ablk_helper.h>
#include <crypto/algapi.h>
#include <crypto/ctr.h>
#include <crypto/lrw.h>
@@ -22,7 +23,6 @@
#include <asm/xcr.h>
#include <asm/xsave.h>
#include <asm/crypto/serpent-avx.h>
-#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>
#define SERPENT_AVX2_PARALLEL_BLOCKS 16
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 9ae83cf8d21..ff487087097 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -28,6 +28,7 @@
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <crypto/ablk_helper.h>
#include <crypto/algapi.h>
#include <crypto/serpent.h>
#include <crypto/cryptd.h>
@@ -38,7 +39,6 @@
#include <asm/xcr.h>
#include <asm/xsave.h>
#include <asm/crypto/serpent-avx.h>
-#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>
/* 8-way parallel cipher functions */
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 97a356ece24..8c95f863730 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -34,6 +34,7 @@
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <crypto/ablk_helper.h>
#include <crypto/algapi.h>
#include <crypto/serpent.h>
#include <crypto/cryptd.h>
@@ -42,7 +43,6 @@
#include <crypto/lrw.h>
#include <crypto/xts.h>
#include <asm/crypto/serpent-sse2.h>
-#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>
static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
new file mode 100644
index 00000000000..1cd792db15e
--- /dev/null
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -0,0 +1,708 @@
+/*
+ * Implement fast SHA-1 with AVX2 instructions. (x86_64)
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Ilya Albrekht <ilya.albrekht@intel.com>
+ * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ * Ronen Zohar <ronen.zohar@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
+ *
+ * This implementation is based on the previous SSSE3 release:
+ * Visit http://software.intel.com/en-us/articles/
+ * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ * Updates the 20-byte SHA-1 record in 'hash' for an even number
+ * ('num_blocks') of consecutive 64-byte blocks.
+ *
+ * extern "C" void sha1_transform_avx2(
+ *	int *hash, const char* input, size_t num_blocks );
+ */
+
+#include <linux/linkage.h>
+
+#define CTX %rdi /* arg1 */
+#define BUF %rsi /* arg2 */
+#define CNT %rdx /* arg3 */
+
+#define REG_A %ecx
+#define REG_B %esi
+#define REG_C %edi
+#define REG_D %eax
+#define REG_E %edx
+#define REG_TB %ebx
+#define REG_TA %r12d
+#define REG_RA %rcx
+#define REG_RB %rsi
+#define REG_RC %rdi
+#define REG_RD %rax
+#define REG_RE %rdx
+#define REG_RTA %r12
+#define REG_RTB %rbx
+#define REG_T1 %ebp
+#define xmm_mov vmovups
+#define avx2_zeroupper vzeroupper
+#define RND_F1 1
+#define RND_F2 2
+#define RND_F3 3
+
+.macro REGALLOC
+ .set A, REG_A
+ .set B, REG_B
+ .set C, REG_C
+ .set D, REG_D
+ .set E, REG_E
+ .set TB, REG_TB
+ .set TA, REG_TA
+
+ .set RA, REG_RA
+ .set RB, REG_RB
+ .set RC, REG_RC
+ .set RD, REG_RD
+ .set RE, REG_RE
+
+ .set RTA, REG_RTA
+ .set RTB, REG_RTB
+
+ .set T1, REG_T1
+.endm
+
+#define K_BASE %r8
+#define HASH_PTR %r9
+#define BUFFER_PTR %r10
+#define BUFFER_PTR2 %r13
+#define BUFFER_END %r11
+
+#define PRECALC_BUF %r14
+#define WK_BUF %r15
+
+#define W_TMP %xmm0
+#define WY_TMP %ymm0
+#define WY_TMP2 %ymm9
+
+# AVX2 variables
+#define WY0 %ymm3
+#define WY4 %ymm5
+#define WY08 %ymm7
+#define WY12 %ymm8
+#define WY16 %ymm12
+#define WY20 %ymm13
+#define WY24 %ymm14
+#define WY28 %ymm15
+
+#define YMM_SHUFB_BSWAP %ymm10
+
+/*
+ * Keep 2 iterations precalculated at a time:
+ * - 80 DWORDs per iteration * 2
+ */
+#define W_SIZE (80*2*2 +16)
+
+#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
+#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
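+
+/*
+ * WK(t) addresses the precomputed K+W dword for round t: 32 bytes per
+ * group of four rounds, with the second pipelined block's values in
+ * the upper 16 bytes of each group.
+ */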
+
+
+.macro UPDATE_HASH hash, val
+ add \hash, \val
+ mov \val, \hash
+.endm
+
+.macro PRECALC_RESET_WY
+ .set WY_00, WY0
+ .set WY_04, WY4
+ .set WY_08, WY08
+ .set WY_12, WY12
+ .set WY_16, WY16
+ .set WY_20, WY20
+ .set WY_24, WY24
+ .set WY_28, WY28
+ .set WY_32, WY_00
+.endm
+
+.macro PRECALC_ROTATE_WY
+ /* Rotate macros */
+ .set WY_32, WY_28
+ .set WY_28, WY_24
+ .set WY_24, WY_20
+ .set WY_20, WY_16
+ .set WY_16, WY_12
+ .set WY_12, WY_08
+ .set WY_08, WY_04
+ .set WY_04, WY_00
+ .set WY_00, WY_32
+
+ /* Define register aliases */
+ .set WY, WY_00
+ .set WY_minus_04, WY_04
+ .set WY_minus_08, WY_08
+ .set WY_minus_12, WY_12
+ .set WY_minus_16, WY_16
+ .set WY_minus_20, WY_20
+ .set WY_minus_24, WY_24
+ .set WY_minus_28, WY_28
+ .set WY_minus_32, WY
+.endm
+
+.macro PRECALC_00_15
+ .if (i == 0) # Initialize and rotate registers
+ PRECALC_RESET_WY
+ PRECALC_ROTATE_WY
+ .endif
+
+ /* message scheduling pre-compute for rounds 0-15 */
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
+ .elseif ((i & 7) == 1)
+ vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
+ WY_TMP, WY_TMP
+ .elseif ((i & 7) == 2)
+ vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
+ .elseif ((i & 7) == 4)
+ vpaddd K_XMM(K_BASE), WY, WY_TMP
+ .elseif ((i & 7) == 7)
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC_16_31
+ /*
+ * message scheduling pre-compute for rounds 16-31
+ * calculating last 32 w[i] values in 8 XMM registers
+ * pre-calculate K+w[i] values and store to mem
+ * for later load by ALU add instruction
+ *
+ * "brute force" vectorization for rounds 16-31 only
+ * due to w[i]->w[i-3] dependency
+ */
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ /* w[i-14] */
+ vpalignr $8, WY_minus_16, WY_minus_12, WY
+ vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
+ .elseif ((i & 7) == 1)
+ vpxor WY_minus_08, WY, WY
+ vpxor WY_minus_16, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 2)
+ vpxor WY_TMP, WY, WY
+ vpslldq $12, WY, WY_TMP2
+ .elseif ((i & 7) == 3)
+ vpslld $1, WY, WY_TMP
+ vpsrld $31, WY, WY
+ .elseif ((i & 7) == 4)
+ vpor WY, WY_TMP, WY_TMP
+ vpslld $2, WY_TMP2, WY
+ .elseif ((i & 7) == 5)
+ vpsrld $30, WY_TMP2, WY_TMP2
+ vpxor WY, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 7)
+ vpxor WY_TMP2, WY_TMP, WY
+ vpaddd K_XMM(K_BASE), WY, WY_TMP
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC_32_79
+ /*
+ * in SHA-1 specification:
+ * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+ * instead we do equal:
+ * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+ * allows more efficient vectorization
+ * since w[i]=>w[i-3] dependency is broken
+ */
+
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
+ .elseif ((i & 7) == 1)
+ /* W is W_minus_32 before xor */
+ vpxor WY_minus_28, WY, WY
+ .elseif ((i & 7) == 2)
+ vpxor WY_minus_16, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 3)
+ vpxor WY_TMP, WY, WY
+ .elseif ((i & 7) == 4)
+ vpslld $2, WY, WY_TMP
+ .elseif ((i & 7) == 5)
+ vpsrld $30, WY, WY
+ vpor WY, WY_TMP, WY
+ .elseif ((i & 7) == 7)
+ vpaddd K_XMM(K_BASE), WY, WY_TMP
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC r, s
+ .set i, \r
+
+ .if (i < 40)
+ .set K_XMM, 32*0
+ .elseif (i < 80)
+ .set K_XMM, 32*1
+ .elseif (i < 120)
+ .set K_XMM, 32*2
+ .else
+ .set K_XMM, 32*3
+ .endif
+
+ .if (i<32)
+ PRECALC_00_15 \s
+ .elseif (i<64)
+ PRECALC_16_31 \s
+ .elseif (i < 160)
+ PRECALC_32_79 \s
+ .endif
+.endm
+
+.macro ROTATE_STATE
+ .set T_REG, E
+ .set E, D
+ .set D, C
+ .set C, B
+ .set B, TB
+ .set TB, A
+ .set A, T_REG
+
+ .set T_REG, RE
+ .set RE, RD
+ .set RD, RC
+ .set RC, RB
+ .set RB, RTB
+ .set RTB, RA
+ .set RA, T_REG
+.endm
+
+/* Relies on ROUND_FUNC having been set by the RR macro */
+
+.macro RND_FUN f, r
+ .if (\f == RND_F1)
+ ROUND_F1 \r
+ .elseif (\f == RND_F2)
+ ROUND_F2 \r
+ .elseif (\f == RND_F3)
+ ROUND_F3 \r
+ .endif
+.endm
+
+.macro RR r
+ .set round_id, (\r % 80)
+
+ .if (round_id == 0) /* Precalculate F for first round */
+ .set ROUND_FUNC, RND_F1
+ mov B, TB
+
+ rorx $(32-30), B, B /* b>>>2 */
+ andn D, TB, T1
+ and C, TB
+ xor T1, TB
+ .endif
+
+ RND_FUN ROUND_FUNC, \r
+ ROTATE_STATE
+
+ .if (round_id == 18)
+ .set ROUND_FUNC, RND_F2
+ .elseif (round_id == 38)
+ .set ROUND_FUNC, RND_F3
+ .elseif (round_id == 58)
+ .set ROUND_FUNC, RND_F2
+ .endif
+
+ .set round_id, ( (\r+1) % 80)
+
+ RND_FUN ROUND_FUNC, (\r+1)
+ ROTATE_STATE
+.endm
+
+.macro ROUND_F1 r
+ add WK(\r), E
+
+ andn C, A, T1 /* ~b&d */
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ rorx $(32-30),A, TB /* b>>>2 for next round */
+
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ /*
+ * Calculate F for the next round
+ * (b & c) ^ andn[b, d]
+ */
+ and B, A /* b&c */
+ xor T1, A /* F1 = (b&c) ^ (~b&d) */
+
+ lea (RE,RTA), E /* E += A >>> 5 */
+.endm
+
+.macro ROUND_F2 r
+ add WK(\r), E
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ /* Calculate F for the next round */
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ .if ((round_id) < 79)
+ rorx $(32-30), A, TB /* b>>>2 for next round */
+ .endif
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ .if ((round_id) < 79)
+ xor B, A
+ .endif
+
+ add TA, E /* E += A >>> 5 */
+
+ .if ((round_id) < 79)
+ xor C, A
+ .endif
+.endm
+
+.macro ROUND_F3 r
+ add WK(\r), E
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ mov B, T1
+ or A, T1
+
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ rorx $(32-30), A, TB /* b>>>2 for next round */
+
+ /* Calculate F for the next round
+ * (b and c) or (d and (b or c))
+ */
+ and C, T1
+ and B, A
+ or T1, A
+
+ add TA, E /* E += A >>> 5 */
+
+.endm
+
+/*
+ * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+
+ REGALLOC
+
+ mov (HASH_PTR), A
+ mov 4(HASH_PTR), B
+ mov 8(HASH_PTR), C
+ mov 12(HASH_PTR), D
+ mov 16(HASH_PTR), E
+
+ mov %rsp, PRECALC_BUF
+ lea (2*4*80+32)(%rsp), WK_BUF
+
+ # Precalc WK for first 2 blocks
+ PRECALC_OFFSET = 0
+ .set i, 0
+ .rept 160
+ PRECALC i
+ .set i, i + 1
+ .endr
+ PRECALC_OFFSET = 128
+ xchg WK_BUF, PRECALC_BUF
+
+ .align 32
+_loop:
+ /*
+ * code loops through more than one block
+ * we use K_BASE value as a signal of a last block,
+	 * it is set below by: cmovae K_BASE, BUFFER_PTR
+ */
+ cmp K_BASE, BUFFER_PTR
+ jne _begin
+ .align 32
+ jmp _end
+ .align 32
+_begin:
+
+ /*
+ * Do first block
+ * rounds: 0,2,4,6,8
+ */
+ .set j, 0
+ .rept 5
+ RR j
+ .set j, j+2
+ .endr
+
+ jmp _loop0
+_loop0:
+
+ /*
+ * rounds:
+ * 10,12,14,16,18
+ * 20,22,24,26,28
+ * 30,32,34,36,38
+ * 40,42,44,46,48
+ * 50,52,54,56,58
+ */
+ .rept 25
+ RR j
+ .set j, j+2
+ .endr
+
+ add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */
+ cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */
+ cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */
+
+ /*
+ * rounds
+ * 60,62,64,66,68
+ * 70,72,74,76,78
+ */
+ .rept 10
+ RR j
+ .set j, j+2
+ .endr
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), TB
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ cmp K_BASE, BUFFER_PTR /* is current block the last one? */
+ je _loop
+
+ mov TB, B
+
+ /* Process second block */
+ /*
+ * rounds
+ * 0+80, 2+80, 4+80, 6+80, 8+80
+ * 10+80,12+80,14+80,16+80,18+80
+ */
+
+ .set j, 0
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ jmp _loop1
+_loop1:
+ /*
+ * rounds
+ * 20+80,22+80,24+80,26+80,28+80
+ * 30+80,32+80,34+80,36+80,38+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ jmp _loop2
+_loop2:
+
+ /*
+ * rounds
+ * 40+80,42+80,44+80,46+80,48+80
+ * 50+80,52+80,54+80,56+80,58+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */
+
+ cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */
+ cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */
+
+ jmp _loop3
+_loop3:
+
+ /*
+ * rounds
+ * 60+80,62+80,64+80,66+80,68+80
+ * 70+80,72+80,74+80,76+80,78+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), TB
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ /* Reset state for AVX2 reg permutation */
+ mov A, TA
+ mov TB, A
+ mov C, TB
+ mov E, C
+ mov D, B
+ mov TA, D
+
+ REGALLOC
+
+ xchg WK_BUF, PRECALC_BUF
+
+ jmp _loop
+
+ .align 32
+ _end:
+
+.endm
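
One detail of the loop above worth spelling out: there is no separate
iteration counter. Once a buffer pointer passes BUFFER_END, cmovae
overwrites it with K_BASE, so the test at _loop is a single compare against
that sentinel. The same idiom in C (names hypothetical; assumes at least
one block):

#include <stdint.h>
#include <stddef.h>

/* Sentinel-pointer loop: 'p' doubles as the induction variable and
 * the termination flag, mirroring cmovae K_BASE, BUFFER_PTR.
 */
static void process_blocks(const uint8_t *buf, size_t nblocks,
			   void (*one_block)(const uint8_t *))
{
	static const uint8_t sentinel;		/* stands in for K_BASE */
	const uint8_t *end = buf + nblocks * 64;
	const uint8_t *p = buf;

	while (p != &sentinel) {
		one_block(p);
		p += 64;
		if (p >= end)		/* like cmovae K_BASE, BUFFER_PTR */
			p = &sentinel;
	}
}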
+/*
+ * This macro implements the SHA-1 function body for several 64-byte blocks.
+ * param: function name
+ */
+.macro SHA1_VECTOR_ASM name
+ ENTRY(\name)
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ RESERVE_STACK = (W_SIZE*4 + 8+24)
+
+ /* Align stack */
+ mov %rsp, %rbx
+ and $~(0x20-1), %rsp
+ push %rbx
+ sub $RESERVE_STACK, %rsp
+
+ avx2_zeroupper
+
+ lea K_XMM_AR(%rip), K_BASE
+
+ mov CTX, HASH_PTR
+ mov BUF, BUFFER_PTR
+ lea 64(BUF), BUFFER_PTR2
+
+ shl $6, CNT /* mul by 64 */
+ add BUF, CNT
+ add $64, CNT
+ mov CNT, BUFFER_END
+
+ cmp BUFFER_END, BUFFER_PTR2
+ cmovae K_BASE, BUFFER_PTR2
+
+ xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
+
+ SHA1_PIPELINED_MAIN_BODY
+
+ avx2_zeroupper
+
+ add $RESERVE_STACK, %rsp
+ pop %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+
+ ret
+
+ ENDPROC(\name)
+.endm
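
Two things happen in the prologue above: %rsp is aligned down to 32 bytes
for the YMM spill area (the pre-alignment value is stashed on the aligned
stack and restored with pop %rsp), and the end-of-input pointer is derived
from the block count, biased by one block because two blocks are consumed
per iteration. A C rendering of the pointer setup, as a sketch with
hypothetical names:

#include <stdint.h>

struct sha1_avx2_ptrs {
	const uint8_t *ptr1;	/* BUFFER_PTR: odd blocks   */
	const uint8_t *ptr2;	/* BUFFER_PTR2: even blocks */
	const uint8_t *end;	/* BUFFER_END               */
};

static struct sha1_avx2_ptrs setup_ptrs(const uint8_t *buf, uint64_t cnt,
					const uint8_t *k_base)
{
	struct sha1_avx2_ptrs p;

	p.ptr1 = buf;
	p.ptr2 = buf + 64;
	p.end  = buf + (cnt << 6) + 64;	/* shl $6; add BUF; add $64 */
	if (p.ptr2 >= p.end)	/* cmovae: park ptr2 at the sentinel */
		p.ptr2 = k_base;
	return p;
}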
+
+.section .rodata
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.align 128
+K_XMM_AR:
+ .long K1, K1, K1, K1
+ .long K1, K1, K1, K1
+ .long K2, K2, K2, K2
+ .long K2, K2, K2, K2
+ .long K3, K3, K3, K3
+ .long K3, K3, K3, K3
+ .long K4, K4, K4, K4
+ .long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+.text
+
+SHA1_VECTOR_ASM sha1_transform_avx2
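
In the data section above, K_XMM_AR holds each round constant splatted
across all vector lanes, and BSWAP_SHUFB_CTL is a vpshufb control that
byte-reverses every 32-bit lane, converting the little-endian message load
to SHA-1's big-endian word order. Per word it is equivalent to this scalar
sketch (assuming a GCC-style __builtin_bswap32):

#include <stdint.h>
#include <string.h>

/* Scalar equivalent of vpshufb with BSWAP_SHUFB_CTL: load 16 message
 * words and byte-swap each one to big-endian order.
 */
static void load_be32_words(uint32_t w[16], const uint8_t *block)
{
	int i;

	for (i = 0; i < 16; i++) {
		uint32_t v;

		memcpy(&v, block + 4 * i, sizeof(v));
		w[i] = __builtin_bswap32(v);	/* bytes 0123 -> 3210 */
	}
}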
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 4a11a9d7245..74d16ef707c 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -10,6 +10,7 @@
* Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
* Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
* Copyright (c) Mathias Krause <minipli@googlemail.com>
+ * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
@@ -39,6 +40,12 @@ asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
unsigned int rounds);
#endif
+#ifdef CONFIG_AS_AVX2
+#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal when >= 4*64-byte blocks */
+
+asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
+ unsigned int rounds);
+#endif
static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
@@ -165,6 +172,18 @@ static int sha1_ssse3_import(struct shash_desc *desc, const void *in)
return 0;
}
+#ifdef CONFIG_AS_AVX2
+static void sha1_apply_transform_avx2(u32 *digest, const char *data,
+ unsigned int rounds)
+{
+	/* Select the optimal transform based on the number of blocks */
+ if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE)
+ sha1_transform_avx2(digest, data, rounds);
+ else
+ sha1_transform_avx(digest, data, rounds);
+}
+#endif
+
static struct shash_alg alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_ssse3_init,
@@ -201,27 +220,49 @@ static bool __init avx_usable(void)
return true;
}
+
+#ifdef CONFIG_AS_AVX2
+static bool __init avx2_usable(void)
+{
+ if (avx_usable() && cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI1) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ return true;
+
+ return false;
+}
+#endif
#endif
static int __init sha1_ssse3_mod_init(void)
{
+ char *algo_name;
+
/* test for SSSE3 first */
- if (cpu_has_ssse3)
+ if (cpu_has_ssse3) {
sha1_transform_asm = sha1_transform_ssse3;
+ algo_name = "SSSE3";
+ }
#ifdef CONFIG_AS_AVX
/* allow AVX to override SSSE3, it's a little faster */
- if (avx_usable())
+ if (avx_usable()) {
sha1_transform_asm = sha1_transform_avx;
+ algo_name = "AVX";
+#ifdef CONFIG_AS_AVX2
+ /* allow AVX2 to override AVX, it's a little faster */
+ if (avx2_usable()) {
+ sha1_transform_asm = sha1_apply_transform_avx2;
+ algo_name = "AVX2";
+ }
+#endif
+ }
#endif
if (sha1_transform_asm) {
- pr_info("Using %s optimized SHA-1 implementation\n",
- sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3"
- : "AVX");
+ pr_info("Using %s optimized SHA-1 implementation\n", algo_name);
return crypto_register_shash(&alg);
}
- pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+ pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n");
return -ENODEV;
}
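
The init path now builds a strict override chain: SSSE3, then AVX, then
AVX2, the last gated on BMI1 and BMI2 because the AVX2 body leans on andn
and rorx. On top of that, sha1_apply_transform_avx2 falls back to the AVX
routine for inputs shorter than four blocks, where the two-block pipeline
cannot pay off. A condensed sketch of the selection order, with
hypothetical stubs standing in for the kernel feature tests:

#include <stddef.h>
#include <stdbool.h>

typedef void (*sha1_xform_t)(unsigned int *digest, const char *data,
			     unsigned int blocks);

/* Hypothetical stand-ins for cpu_has_xxx / boot_cpu_has and the asm
 * entry points.
 */
extern bool has_ssse3(void);
extern bool has_avx(void);
extern bool has_avx2_bmi1_bmi2(void);
extern void xform_ssse3(unsigned int *, const char *, unsigned int);
extern void xform_avx(unsigned int *, const char *, unsigned int);
extern void xform_avx2(unsigned int *, const char *, unsigned int);

static sha1_xform_t pick_sha1_transform(void)
{
	sha1_xform_t xform = NULL;

	if (has_ssse3())
		xform = xform_ssse3;
	if (has_avx()) {
		xform = xform_avx;	/* AVX overrides SSSE3 */
		if (has_avx2_bmi1_bmi2())
			xform = xform_avx2;	/* AVX2 overrides AVX */
	}
	return xform;	/* NULL means -ENODEV in the real module */
}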
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 50226c4b86e..f248546da1c 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -281,7 +281,7 @@ static int __init sha256_ssse3_mod_init(void)
/* allow AVX to override SSSE3, it's a little faster */
if (avx_usable()) {
#ifdef CONFIG_AS_AVX2
- if (boot_cpu_has(X86_FEATURE_AVX2))
+ if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2))
sha256_transform_asm = sha256_transform_rorx;
else
#endif
@@ -319,4 +319,4 @@ MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
MODULE_ALIAS("sha256");
-MODULE_ALIAS("sha384");
+MODULE_ALIAS("sha224");
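
The tightened sha256 check exists because sha256_transform_rorx relies on
rorx, which belongs to BMI2 rather than AVX2, so a CPU (or a guest with
masked feature bits) could advertise AVX2 without being able to execute
the code. For reference, rorx is a three-operand rotate that leaves the
flags untouched; a scalar C equivalent:

#include <stdint.h>

/* C equivalent of rorx: rotate right by n without touching flags.
 * Note the SHA-1 asm above uses rorx $(32-n) to get a left rotate by n.
 */
static inline uint32_t ror32(uint32_t x, unsigned int n)
{
	n &= 31;
	return (x >> n) | (x << ((32 - n) & 31));
}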
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index f30cd10293f..8626b03e83b 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -141,7 +141,7 @@ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
/* save number of bits */
bits[1] = cpu_to_be64(sctx->count[0] << 3);
- bits[0] = cpu_to_be64(sctx->count[1] << 3) | sctx->count[0] >> 61;
+ bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
/* Pad out to 112 mod 128 and append length */
index = sctx->count[0] & 0x7f;
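
The sha512 one-liner fixes where the carry out of count[0] ends up: the
128-bit bit length is (count[1]:count[0]) << 3, so the top three bits of
count[0] must be merged into the high word before the byte swap, not OR-ed
onto the already-swapped value. A standalone sketch of the corrected
computation (cpu_to_be64 approximated for a little-endian host):

#include <stdint.h>

static inline uint64_t cpu_to_be64_sketch(uint64_t x)
{
	return __builtin_bswap64(x);	/* assumes a little-endian host */
}

/* Build the 128-bit big-endian bit count from a byte count held in
 * count[1]:count[0], as the corrected final path does.
 */
static void sha512_len(const uint64_t count[2], uint64_t bits[2])
{
	bits[1] = cpu_to_be64_sketch(count[0] << 3);
	bits[0] = cpu_to_be64_sketch((count[1] << 3) | (count[0] >> 61));
}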
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index a62ba541884..4e3c665be12 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -28,6 +28,7 @@
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <crypto/ablk_helper.h>
#include <crypto/algapi.h>
#include <crypto/twofish.h>
#include <crypto/cryptd.h>
@@ -39,7 +40,6 @@
#include <asm/xcr.h>
#include <asm/xsave.h>
#include <asm/crypto/twofish.h>
-#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>
#include <crypto/scatterwalk.h>
#include <linux/workqueue.h>