diff options
Diffstat (limited to 'arch/x86')
44 files changed, 2508 insertions, 837 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 903de4aa509..ebe7deedd5b 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o +obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o @@ -19,3 +20,5 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o + +aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S index e41b147f450..b949ec2f9af 100644 --- a/arch/x86/crypto/aes-i586-asm_32.S +++ b/arch/x86/crypto/aes-i586-asm_32.S @@ -41,14 +41,14 @@ #define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words) /* offsets to parameters with one register pushed onto stack */ -#define tfm 8 +#define ctx 8 #define out_blk 12 #define in_blk 16 -/* offsets in crypto_tfm structure */ -#define klen (crypto_tfm_ctx_offset + 0) -#define ekey (crypto_tfm_ctx_offset + 4) -#define dkey (crypto_tfm_ctx_offset + 244) +/* offsets in crypto_aes_ctx structure */ +#define klen (480) +#define ekey (0) +#define dkey (240) // register mapping for encrypt and decrypt subroutines @@ -217,7 +217,7 @@ do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */ // AES (Rijndael) Encryption Subroutine -/* void aes_enc_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */ +/* void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */ .global aes_enc_blk @@ -228,7 +228,7 @@ aes_enc_blk: push %ebp - mov tfm(%esp),%ebp + mov ctx(%esp),%ebp // CAUTION: the order and the values used in these assigns // rely on the register mappings @@ -292,7 +292,7 @@ aes_enc_blk: ret // AES (Rijndael) Decryption Subroutine -/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */ +/* void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */ .global aes_dec_blk @@ -303,7 +303,7 @@ aes_enc_blk: aes_dec_blk: push %ebp - mov tfm(%esp),%ebp + mov ctx(%esp),%ebp // CAUTION: the order and the values used in these assigns // rely on the register mappings diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S index a120f526c3d..5b577d5a059 100644 --- a/arch/x86/crypto/aes-x86_64-asm_64.S +++ b/arch/x86/crypto/aes-x86_64-asm_64.S @@ -17,8 +17,6 @@ #include <asm/asm-offsets.h> -#define BASE crypto_tfm_ctx_offset - #define R1 %rax #define R1E %eax #define R1X %ax @@ -56,13 +54,13 @@ .align 8; \ FUNC: movq r1,r2; \ movq r3,r4; \ - leaq BASE+KEY+48+4(r8),r9; \ + leaq KEY+48(r8),r9; \ movq r10,r11; \ movl (r7),r5 ## E; \ movl 4(r7),r1 ## E; \ movl 8(r7),r6 ## E; \ movl 12(r7),r7 ## E; \ - movl BASE+0(r8),r10 ## E; \ + movl 480(r8),r10 ## E; \ xorl -48(r9),r5 ## E; \ xorl -44(r9),r1 ## E; \ xorl -40(r9),r6 ## E; \ diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c index 71f45782711..49ae9fe32b2 100644 --- a/arch/x86/crypto/aes_glue.c +++ b/arch/x86/crypto/aes_glue.c @@ -5,17 +5,29 @@ #include <crypto/aes.h> -asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); -asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); +asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); +asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); + +void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) +{ + aes_enc_blk(ctx, dst, src); +} +EXPORT_SYMBOL_GPL(crypto_aes_encrypt_x86); + +void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) +{ + aes_dec_blk(ctx, dst, src); +} +EXPORT_SYMBOL_GPL(crypto_aes_decrypt_x86); static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { - aes_enc_blk(tfm, dst, src); + aes_enc_blk(crypto_tfm_ctx(tfm), dst, src); } static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { - aes_dec_blk(tfm, dst, src); + aes_dec_blk(crypto_tfm_ctx(tfm), dst, src); } static struct crypto_alg aes_alg = { diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S new file mode 100644 index 00000000000..caba9960170 --- /dev/null +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -0,0 +1,896 @@ +/* + * Implement AES algorithm in Intel AES-NI instructions. + * + * The white paper of AES-NI instructions can be downloaded from: + * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf + * + * Copyright (C) 2008, Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * Vinodh Gopal <vinodh.gopal@intel.com> + * Kahraman Akdemir + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.text + +#define STATE1 %xmm0 +#define STATE2 %xmm4 +#define STATE3 %xmm5 +#define STATE4 %xmm6 +#define STATE STATE1 +#define IN1 %xmm1 +#define IN2 %xmm7 +#define IN3 %xmm8 +#define IN4 %xmm9 +#define IN IN1 +#define KEY %xmm2 +#define IV %xmm3 + +#define KEYP %rdi +#define OUTP %rsi +#define INP %rdx +#define LEN %rcx +#define IVP %r8 +#define KLEN %r9d +#define T1 %r10 +#define TKEYP T1 +#define T2 %r11 + +_key_expansion_128: +_key_expansion_256a: + pshufd $0b11111111, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + movaps %xmm0, (%rcx) + add $0x10, %rcx + ret + +_key_expansion_192a: + pshufd $0b01010101, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + + movaps %xmm2, %xmm5 + movaps %xmm2, %xmm6 + pslldq $4, %xmm5 + pshufd $0b11111111, %xmm0, %xmm3 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + + movaps %xmm0, %xmm1 + shufps $0b01000100, %xmm0, %xmm6 + movaps %xmm6, (%rcx) + shufps $0b01001110, %xmm2, %xmm1 + movaps %xmm1, 16(%rcx) + add $0x20, %rcx + ret + +_key_expansion_192b: + pshufd $0b01010101, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + + movaps %xmm2, %xmm5 + pslldq $4, %xmm5 + pshufd $0b11111111, %xmm0, %xmm3 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + + movaps %xmm0, (%rcx) + add $0x10, %rcx + ret + +_key_expansion_256b: + pshufd $0b10101010, %xmm1, %xmm1 + shufps $0b00010000, %xmm2, %xmm4 + pxor %xmm4, %xmm2 + shufps $0b10001100, %xmm2, %xmm4 + pxor %xmm4, %xmm2 + pxor %xmm1, %xmm2 + movaps %xmm2, (%rcx) + add $0x10, %rcx + ret + +/* + * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, + * unsigned int key_len) + */ +ENTRY(aesni_set_key) + movups (%rsi), %xmm0 # user key (first 16 bytes) + movaps %xmm0, (%rdi) + lea 0x10(%rdi), %rcx # key addr + movl %edx, 480(%rdi) + pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x + cmp $24, %dl + jb .Lenc_key128 + je .Lenc_key192 + movups 0x10(%rsi), %xmm2 # other user key + movaps %xmm2, (%rcx) + add $0x10, %rcx + # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01 + call _key_expansion_256a + # aeskeygenassist $0x1, %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01 + call _key_expansion_256b + # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02 + call _key_expansion_256a + # aeskeygenassist $0x2, %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02 + call _key_expansion_256b + # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04 + call _key_expansion_256a + # aeskeygenassist $0x4, %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04 + call _key_expansion_256b + # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08 + call _key_expansion_256a + # aeskeygenassist $0x8, %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08 + call _key_expansion_256b + # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10 + call _key_expansion_256a + # aeskeygenassist $0x10, %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10 + call _key_expansion_256b + # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20 + call _key_expansion_256a + # aeskeygenassist $0x20, %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20 + call _key_expansion_256b + # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40 + call _key_expansion_256a + jmp .Ldec_key +.Lenc_key192: + movq 0x10(%rsi), %xmm2 # other user key + # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01 + call _key_expansion_192a + # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02 + call _key_expansion_192b + # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04 + call _key_expansion_192a + # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08 + call _key_expansion_192b + # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10 + call _key_expansion_192a + # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20 + call _key_expansion_192b + # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40 + call _key_expansion_192a + # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80 + call _key_expansion_192b + jmp .Ldec_key +.Lenc_key128: + # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01 + call _key_expansion_128 + # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02 + call _key_expansion_128 + # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04 + call _key_expansion_128 + # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08 + call _key_expansion_128 + # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10 + call _key_expansion_128 + # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20 + call _key_expansion_128 + # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40 + call _key_expansion_128 + # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80 + call _key_expansion_128 + # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b + call _key_expansion_128 + # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 + .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36 + call _key_expansion_128 +.Ldec_key: + sub $0x10, %rcx + movaps (%rdi), %xmm0 + movaps (%rcx), %xmm1 + movaps %xmm0, 240(%rcx) + movaps %xmm1, 240(%rdi) + add $0x10, %rdi + lea 240-16(%rcx), %rsi +.align 4 +.Ldec_key_loop: + movaps (%rdi), %xmm0 + # aesimc %xmm0, %xmm1 + .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8 + movaps %xmm1, (%rsi) + add $0x10, %rdi + sub $0x10, %rsi + cmp %rcx, %rdi + jb .Ldec_key_loop + xor %rax, %rax + ret + +/* + * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) + */ +ENTRY(aesni_enc) + movl 480(KEYP), KLEN # key length + movups (INP), STATE # input + call _aesni_enc1 + movups STATE, (OUTP) # output + ret + +/* + * _aesni_enc1: internal ABI + * input: + * KEYP: key struct pointer + * KLEN: round count + * STATE: initial state (input) + * output: + * STATE: finial state (output) + * changed: + * KEY + * TKEYP (T1) + */ +_aesni_enc1: + movaps (KEYP), KEY # key + mov KEYP, TKEYP + pxor KEY, STATE # round 0 + add $0x30, TKEYP + cmp $24, KLEN + jb .Lenc128 + lea 0x20(TKEYP), TKEYP + je .Lenc192 + add $0x20, TKEYP + movaps -0x60(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps -0x50(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 +.align 4 +.Lenc192: + movaps -0x40(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps -0x30(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 +.align 4 +.Lenc128: + movaps -0x20(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps -0x10(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps (TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x10(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x20(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x30(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x40(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x50(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x60(TKEYP), KEY + # aesenc KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + movaps 0x70(TKEYP), KEY + # aesenclast KEY, STATE # last round + .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 + ret + +/* + * _aesni_enc4: internal ABI + * input: + * KEYP: key struct pointer + * KLEN: round count + * STATE1: initial state (input) + * STATE2 + * STATE3 + * STATE4 + * output: + * STATE1: finial state (output) + * STATE2 + * STATE3 + * STATE4 + * changed: + * KEY + * TKEYP (T1) + */ +_aesni_enc4: + movaps (KEYP), KEY # key + mov KEYP, TKEYP + pxor KEY, STATE1 # round 0 + pxor KEY, STATE2 + pxor KEY, STATE3 + pxor KEY, STATE4 + add $0x30, TKEYP + cmp $24, KLEN + jb .L4enc128 + lea 0x20(TKEYP), TKEYP + je .L4enc192 + add $0x20, TKEYP + movaps -0x60(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps -0x50(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 +#.align 4 +.L4enc192: + movaps -0x40(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps -0x30(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 +#.align 4 +.L4enc128: + movaps -0x20(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps -0x10(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps (TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x10(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x20(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x30(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x40(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x50(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x60(TKEYP), KEY + # aesenc KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + # aesenc KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 + # aesenc KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xea + # aesenc KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + movaps 0x70(TKEYP), KEY + # aesenclast KEY, STATE1 # last round + .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 + # aesenclast KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xdd, 0xe2 + # aesenclast KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xdd, 0xea + # aesenclast KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xdd, 0xf2 + ret + +/* + * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) + */ +ENTRY(aesni_dec) + mov 480(KEYP), KLEN # key length + add $240, KEYP + movups (INP), STATE # input + call _aesni_dec1 + movups STATE, (OUTP) #output + ret + +/* + * _aesni_dec1: internal ABI + * input: + * KEYP: key struct pointer + * KLEN: key length + * STATE: initial state (input) + * output: + * STATE: finial state (output) + * changed: + * KEY + * TKEYP (T1) + */ +_aesni_dec1: + movaps (KEYP), KEY # key + mov KEYP, TKEYP + pxor KEY, STATE # round 0 + add $0x30, TKEYP + cmp $24, KLEN + jb .Ldec128 + lea 0x20(TKEYP), TKEYP + je .Ldec192 + add $0x20, TKEYP + movaps -0x60(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps -0x50(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 +.align 4 +.Ldec192: + movaps -0x40(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps -0x30(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 +.align 4 +.Ldec128: + movaps -0x20(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps -0x10(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps (TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x10(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x20(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x30(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x40(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x50(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x60(TKEYP), KEY + # aesdec KEY, STATE + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + movaps 0x70(TKEYP), KEY + # aesdeclast KEY, STATE # last round + .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 + ret + +/* + * _aesni_dec4: internal ABI + * input: + * KEYP: key struct pointer + * KLEN: key length + * STATE1: initial state (input) + * STATE2 + * STATE3 + * STATE4 + * output: + * STATE1: finial state (output) + * STATE2 + * STATE3 + * STATE4 + * changed: + * KEY + * TKEYP (T1) + */ +_aesni_dec4: + movaps (KEYP), KEY # key + mov KEYP, TKEYP + pxor KEY, STATE1 # round 0 + pxor KEY, STATE2 + pxor KEY, STATE3 + pxor KEY, STATE4 + add $0x30, TKEYP + cmp $24, KLEN + jb .L4dec128 + lea 0x20(TKEYP), TKEYP + je .L4dec192 + add $0x20, TKEYP + movaps -0x60(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps -0x50(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 +.align 4 +.L4dec192: + movaps -0x40(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps -0x30(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 +.align 4 +.L4dec128: + movaps -0x20(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps -0x10(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps (TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps 0x10(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + # aesdec KEY, STATE2 + .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 + # aesdec KEY, STATE3 + .byte 0x66, 0x0f, 0x38, 0xde, 0xea + # aesdec KEY, STATE4 + .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + movaps 0x20(TKEYP), KEY + # aesdec KEY, STATE1 + .byte 0x66, 0x0f, |