Diffstat (limited to 'arch/arm64/crypto')
-rw-r--r--  arch/arm64/crypto/Kconfig             |  53
-rw-r--r--  arch/arm64/crypto/Makefile            |  38
-rw-r--r--  arch/arm64/crypto/aes-ce-ccm-core.S   | 222
-rw-r--r--  arch/arm64/crypto/aes-ce-ccm-glue.c   | 297
-rw-r--r--  arch/arm64/crypto/aes-ce-cipher.c     | 155
-rw-r--r--  arch/arm64/crypto/aes-ce.S            | 133
-rw-r--r--  arch/arm64/crypto/aes-glue.c          | 446
-rw-r--r--  arch/arm64/crypto/aes-modes.S         | 532
-rw-r--r--  arch/arm64/crypto/aes-neon.S          | 382
-rw-r--r--  arch/arm64/crypto/ghash-ce-core.S     |  79
-rw-r--r--  arch/arm64/crypto/ghash-ce-glue.c     | 156
-rw-r--r--  arch/arm64/crypto/sha1-ce-core.S      | 153
-rw-r--r--  arch/arm64/crypto/sha1-ce-glue.c      | 174
-rw-r--r--  arch/arm64/crypto/sha2-ce-core.S      | 156
-rw-r--r--  arch/arm64/crypto/sha2-ce-glue.c      | 255
15 files changed, 3231 insertions, 0 deletions
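
The glue code below derives the AES round count from the key length as 6 + key_length/4 (10/12/14 rounds for 128/192/256-bit keys), and ccm_init_mac() in aes-ce-ccm-glue.c folds the tag size and the presence of associated data into the CCM B0 flags byte. A minimal standalone C sketch of just that arithmetic, as a reading aid for the patch (the example_* names are illustrative and not part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	/* AES round count per FIPS-197: 6 + key_length/4, as in num_rounds() below */
	static int example_num_rounds(unsigned int key_length)
	{
		return 6 + key_length / 4;
	}

	/*
	 * CCM B0 flags byte (RFC 3610): bits 0..2 hold L-1 (length-of-length),
	 * bits 3..5 hold (tag_len - 2) / 2, and bit 6 is set when associated
	 * data is present -- the same layout ccm_init_mac() assembles below.
	 */
	static uint8_t example_ccm_flags(unsigned int l, unsigned int tag_len, int have_aad)
	{
		return (uint8_t)((l - 1) | (((tag_len - 2) / 2) << 3) | (have_aad ? 0x40 : 0));
	}

	int main(void)
	{
		printf("AES-128: %d rounds\n", example_num_rounds(16));     /* 10 */
		printf("AES-256: %d rounds\n", example_num_rounds(32));     /* 14 */
		printf("B0 flags: 0x%02x\n", example_ccm_flags(2, 16, 1));  /* 0x79 */
		return 0;
	}
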
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig new file mode 100644 index 00000000000..5562652c531 --- /dev/null +++ b/arch/arm64/crypto/Kconfig @@ -0,0 +1,53 @@ + +menuconfig ARM64_CRYPTO +	bool "ARM64 Accelerated Cryptographic Algorithms" +	depends on ARM64 +	help +	  Say Y here to choose from a selection of cryptographic algorithms +	  implemented using ARM64 specific CPU features or instructions. + +if ARM64_CRYPTO + +config CRYPTO_SHA1_ARM64_CE +	tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)" +	depends on ARM64 && KERNEL_MODE_NEON +	select CRYPTO_HASH + +config CRYPTO_SHA2_ARM64_CE +	tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)" +	depends on ARM64 && KERNEL_MODE_NEON +	select CRYPTO_HASH + +config CRYPTO_GHASH_ARM64_CE +	tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions" +	depends on ARM64 && KERNEL_MODE_NEON +	select CRYPTO_HASH + +config CRYPTO_AES_ARM64_CE +	tristate "AES core cipher using ARMv8 Crypto Extensions" +	depends on ARM64 && KERNEL_MODE_NEON +	select CRYPTO_ALGAPI +	select CRYPTO_AES + +config CRYPTO_AES_ARM64_CE_CCM +	tristate "AES in CCM mode using ARMv8 Crypto Extensions" +	depends on ARM64 && KERNEL_MODE_NEON +	select CRYPTO_ALGAPI +	select CRYPTO_AES +	select CRYPTO_AEAD + +config CRYPTO_AES_ARM64_CE_BLK +	tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions" +	depends on ARM64 && KERNEL_MODE_NEON +	select CRYPTO_BLKCIPHER +	select CRYPTO_AES +	select CRYPTO_ABLK_HELPER + +config CRYPTO_AES_ARM64_NEON_BLK +	tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" +	depends on ARM64 && KERNEL_MODE_NEON +	select CRYPTO_BLKCIPHER +	select CRYPTO_AES +	select CRYPTO_ABLK_HELPER + +endif diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile new file mode 100644 index 00000000000..2070a56ecc4 --- /dev/null +++ b/arch/arm64/crypto/Makefile @@ -0,0 +1,38 @@ +# +# linux/arch/arm64/crypto/Makefile +# +# Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+# + +obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o +sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o + +obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o +sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o + +obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o +ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o + +obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o +CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto + +obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o +aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o + +obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o +aes-ce-blk-y := aes-glue-ce.o aes-ce.o + +obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o +aes-neon-blk-y := aes-glue-neon.o aes-neon.o + +AFLAGS_aes-ce.o		:= -DINTERLEAVE=2 -DINTERLEAVE_INLINE +AFLAGS_aes-neon.o	:= -DINTERLEAVE=4 + +CFLAGS_aes-glue-ce.o	:= -DUSE_V8_CRYPTO_EXTENSIONS + +$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE +	$(call if_changed_dep,cc_o_c) diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S new file mode 100644 index 00000000000..432e4841cd8 --- /dev/null +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -0,0 +1,222 @@ +/* + * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions + * + * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + +	.text +	.arch	armv8-a+crypto + +	/* +	 * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, +	 *			     u32 *macp, u8 const rk[], u32 rounds); +	 */ +ENTRY(ce_aes_ccm_auth_data) +	ldr	w8, [x3]			/* leftover from prev round? */ +	ld1	{v0.2d}, [x0]			/* load mac */ +	cbz	w8, 1f +	sub	w8, w8, #16 +	eor	v1.16b, v1.16b, v1.16b +0:	ldrb	w7, [x1], #1			/* get 1 byte of input */ +	subs	w2, w2, #1 +	add	w8, w8, #1 +	ins	v1.b[0], w7 +	ext	v1.16b, v1.16b, v1.16b, #1	/* rotate in the input bytes */ +	beq	8f				/* out of input? */ +	cbnz	w8, 0b +	eor	v0.16b, v0.16b, v1.16b +1:	ld1	{v3.2d}, [x4]			/* load first round key */ +	prfm	pldl1strm, [x1] +	cmp	w5, #12				/* which key size? */ +	add	x6, x4, #16 +	sub	w7, w5, #2			/* modified # of rounds */ +	bmi	2f +	bne	5f +	mov	v5.16b, v3.16b +	b	4f +2:	mov	v4.16b, v3.16b +	ld1	{v5.2d}, [x6], #16		/* load 2nd round key */ +3:	aese	v0.16b, v4.16b +	aesmc	v0.16b, v0.16b +4:	ld1	{v3.2d}, [x6], #16		/* load next round key */ +	aese	v0.16b, v5.16b +	aesmc	v0.16b, v0.16b +5:	ld1	{v4.2d}, [x6], #16		/* load next round key */ +	subs	w7, w7, #3 +	aese	v0.16b, v3.16b +	aesmc	v0.16b, v0.16b +	ld1	{v5.2d}, [x6], #16		/* load next round key */ +	bpl	3b +	aese	v0.16b, v4.16b +	subs	w2, w2, #16			/* last data? 
*/ +	eor	v0.16b, v0.16b, v5.16b		/* final round */ +	bmi	6f +	ld1	{v1.16b}, [x1], #16		/* load next input block */ +	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */ +	bne	1b +6:	st1	{v0.2d}, [x0]			/* store mac */ +	beq	10f +	adds	w2, w2, #16 +	beq	10f +	mov	w8, w2 +7:	ldrb	w7, [x1], #1 +	umov	w6, v0.b[0] +	eor	w6, w6, w7 +	strb	w6, [x0], #1 +	subs	w2, w2, #1 +	beq	10f +	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */ +	b	7b +8:	mov	w7, w8 +	add	w8, w8, #16 +9:	ext	v1.16b, v1.16b, v1.16b, #1 +	adds	w7, w7, #1 +	bne	9b +	eor	v0.16b, v0.16b, v1.16b +	st1	{v0.2d}, [x0] +10:	str	w8, [x3] +	ret +ENDPROC(ce_aes_ccm_auth_data) + +	/* +	 * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[], +	 * 			 u32 rounds); +	 */ +ENTRY(ce_aes_ccm_final) +	ld1	{v3.2d}, [x2], #16		/* load first round key */ +	ld1	{v0.2d}, [x0]			/* load mac */ +	cmp	w3, #12				/* which key size? */ +	sub	w3, w3, #2			/* modified # of rounds */ +	ld1	{v1.2d}, [x1]			/* load 1st ctriv */ +	bmi	0f +	bne	3f +	mov	v5.16b, v3.16b +	b	2f +0:	mov	v4.16b, v3.16b +1:	ld1	{v5.2d}, [x2], #16		/* load next round key */ +	aese	v0.16b, v4.16b +	aese	v1.16b, v4.16b +	aesmc	v0.16b, v0.16b +	aesmc	v1.16b, v1.16b +2:	ld1	{v3.2d}, [x2], #16		/* load next round key */ +	aese	v0.16b, v5.16b +	aese	v1.16b, v5.16b +	aesmc	v0.16b, v0.16b +	aesmc	v1.16b, v1.16b +3:	ld1	{v4.2d}, [x2], #16		/* load next round key */ +	subs	w3, w3, #3 +	aese	v0.16b, v3.16b +	aese	v1.16b, v3.16b +	aesmc	v0.16b, v0.16b +	aesmc	v1.16b, v1.16b +	bpl	1b +	aese	v0.16b, v4.16b +	aese	v1.16b, v4.16b +	/* final round key cancels out */ +	eor	v0.16b, v0.16b, v1.16b		/* en-/decrypt the mac */ +	st1	{v0.2d}, [x0]			/* store result */ +	ret +ENDPROC(ce_aes_ccm_final) + +	.macro	aes_ccm_do_crypt,enc +	ldr	x8, [x6, #8]			/* load lower ctr */ +	ld1	{v0.2d}, [x5]			/* load mac */ +	rev	x8, x8				/* keep swabbed ctr in reg */ +0:	/* outer loop */ +	ld1	{v1.1d}, [x6]			/* load upper ctr */ +	prfm	pldl1strm, [x1] +	add	x8, x8, #1 +	rev	x9, x8 +	cmp	w4, #12				/* which key size? */ +	sub	w7, w4, #2			/* get modified # of rounds */ +	ins	v1.d[1], x9			/* no carry in lower ctr */ +	ld1	{v3.2d}, [x3]			/* load first round key */ +	add	x10, x3, #16 +	bmi	1f +	bne	4f +	mov	v5.16b, v3.16b +	b	3f +1:	mov	v4.16b, v3.16b +	ld1	{v5.2d}, [x10], #16		/* load 2nd round key */ +2:	/* inner loop: 3 rounds, 2x interleaved */ +	aese	v0.16b, v4.16b +	aese	v1.16b, v4.16b +	aesmc	v0.16b, v0.16b +	aesmc	v1.16b, v1.16b +3:	ld1	{v3.2d}, [x10], #16		/* load next round key */ +	aese	v0.16b, v5.16b +	aese	v1.16b, v5.16b +	aesmc	v0.16b, v0.16b +	aesmc	v1.16b, v1.16b +4:	ld1	{v4.2d}, [x10], #16		/* load next round key */ +	subs	w7, w7, #3 +	aese	v0.16b, v3.16b +	aese	v1.16b, v3.16b +	aesmc	v0.16b, v0.16b +	aesmc	v1.16b, v1.16b +	ld1	{v5.2d}, [x10], #16		/* load next round key */ +	bpl	2b +	aese	v0.16b, v4.16b +	aese	v1.16b, v4.16b +	subs	w2, w2, #16 +	bmi	6f				/* partial block? 
*/ +	ld1	{v2.16b}, [x1], #16		/* load next input block */ +	.if	\enc == 1 +	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */ +	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */ +	.else +	eor	v2.16b, v2.16b, v1.16b		/* xor with crypted ctr */ +	eor	v1.16b, v2.16b, v5.16b		/* final round enc */ +	.endif +	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */ +	st1	{v1.16b}, [x0], #16		/* write output block */ +	bne	0b +	rev	x8, x8 +	st1	{v0.2d}, [x5]			/* store mac */ +	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */ +5:	ret + +6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */ +	eor	v1.16b, v1.16b, v5.16b		/* final round enc */ +	st1	{v0.2d}, [x5]			/* store mac */ +	add	w2, w2, #16			/* process partial tail block */ +7:	ldrb	w9, [x1], #1			/* get 1 byte of input */ +	umov	w6, v1.b[0]			/* get top crypted ctr byte */ +	umov	w7, v0.b[0]			/* get top mac byte */ +	.if	\enc == 1 +	eor	w7, w7, w9 +	eor	w9, w9, w6 +	.else +	eor	w9, w9, w6 +	eor	w7, w7, w9 +	.endif +	strb	w9, [x0], #1			/* store out byte */ +	strb	w7, [x5], #1			/* store mac byte */ +	subs	w2, w2, #1 +	beq	5b +	ext	v0.16b, v0.16b, v0.16b, #1	/* shift out mac byte */ +	ext	v1.16b, v1.16b, v1.16b, #1	/* shift out ctr byte */ +	b	7b +	.endm + +	/* +	 * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, +	 * 			   u8 const rk[], u32 rounds, u8 mac[], +	 * 			   u8 ctr[]); +	 * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, +	 * 			   u8 const rk[], u32 rounds, u8 mac[], +	 * 			   u8 ctr[]); +	 */ +ENTRY(ce_aes_ccm_encrypt) +	aes_ccm_do_crypt	1 +ENDPROC(ce_aes_ccm_encrypt) + +ENTRY(ce_aes_ccm_decrypt) +	aes_ccm_do_crypt	0 +ENDPROC(ce_aes_ccm_decrypt) diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c new file mode 100644 index 00000000000..9e6cdde9b43 --- /dev/null +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -0,0 +1,297 @@ +/* + * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions + * + * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <asm/neon.h> +#include <asm/unaligned.h> +#include <crypto/aes.h> +#include <crypto/algapi.h> +#include <crypto/scatterwalk.h> +#include <linux/crypto.h> +#include <linux/module.h> + +static int num_rounds(struct crypto_aes_ctx *ctx) +{ +	/* +	 * # of rounds specified by AES: +	 * 128 bit key		10 rounds +	 * 192 bit key		12 rounds +	 * 256 bit key		14 rounds +	 * => n byte key	=> 6 + (n/4) rounds +	 */ +	return 6 + ctx->key_length / 4; +} + +asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, +				     u32 *macp, u32 const rk[], u32 rounds); + +asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, +				   u32 const rk[], u32 rounds, u8 mac[], +				   u8 ctr[]); + +asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, +				   u32 const rk[], u32 rounds, u8 mac[], +				   u8 ctr[]); + +asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], +				 u32 rounds); + +static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, +		      unsigned int key_len) +{ +	struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm); +	int ret; + +	ret = crypto_aes_expand_key(ctx, in_key, key_len); +	if (!ret) +		return 0; + +	tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; +	return -EINVAL; +} + +static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) +{ +	if ((authsize & 1) || authsize < 4) +		return -EINVAL; +	return 0; +} + +static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	__be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8]; +	u32 l = req->iv[0] + 1; + +	/* verify that CCM dimension 'L' is set correctly in the IV */ +	if (l < 2 || l > 8) +		return -EINVAL; + +	/* verify that msglen can in fact be represented in L bytes */ +	if (l < 4 && msglen >> (8 * l)) +		return -EOVERFLOW; + +	/* +	 * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi +	 * uses a u32 type to represent msglen so the top 4 bytes are always 0. 
+	 */ +	n[0] = 0; +	n[1] = cpu_to_be32(msglen); + +	memcpy(maciv, req->iv, AES_BLOCK_SIZE - l); + +	/* +	 * Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C) +	 * - bits 0..2	: max # of bytes required to represent msglen, minus 1 +	 *                (already set by caller) +	 * - bits 3..5	: size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc) +	 * - bit 6	: indicates presence of authenticate-only data +	 */ +	maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2; +	if (req->assoclen) +		maciv[0] |= 0x40; + +	memset(&req->iv[AES_BLOCK_SIZE - l], 0, l); +	return 0; +} + +static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); +	struct __packed { __be16 l; __be32 h; u16 len; } ltag; +	struct scatter_walk walk; +	u32 len = req->assoclen; +	u32 macp = 0; + +	/* prepend the AAD with a length tag */ +	if (len < 0xff00) { +		ltag.l = cpu_to_be16(len); +		ltag.len = 2; +	} else  { +		ltag.l = cpu_to_be16(0xfffe); +		put_unaligned_be32(len, <ag.h); +		ltag.len = 6; +	} + +	ce_aes_ccm_auth_data(mac, (u8 *)<ag, ltag.len, &macp, ctx->key_enc, +			     num_rounds(ctx)); +	scatterwalk_start(&walk, req->assoc); + +	do { +		u32 n = scatterwalk_clamp(&walk, len); +		u8 *p; + +		if (!n) { +			scatterwalk_start(&walk, sg_next(walk.sg)); +			n = scatterwalk_clamp(&walk, len); +		} +		p = scatterwalk_map(&walk); +		ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc, +				     num_rounds(ctx)); +		len -= n; + +		scatterwalk_unmap(p); +		scatterwalk_advance(&walk, n); +		scatterwalk_done(&walk, 0, len); +	} while (len); +} + +static int ccm_encrypt(struct aead_request *req) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); +	struct blkcipher_desc desc = { .info = req->iv }; +	struct blkcipher_walk walk; +	u8 __aligned(8) mac[AES_BLOCK_SIZE]; +	u8 buf[AES_BLOCK_SIZE]; +	u32 len = req->cryptlen; +	int err; + +	err = ccm_init_mac(req, mac, len); +	if (err) +		return err; + +	kernel_neon_begin_partial(6); + +	if (req->assoclen) +		ccm_calculate_auth_mac(req, mac); + +	/* preserve the original iv for the final round */ +	memcpy(buf, req->iv, AES_BLOCK_SIZE); + +	blkcipher_walk_init(&walk, req->dst, req->src, len); +	err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, +					     AES_BLOCK_SIZE); + +	while (walk.nbytes) { +		u32 tail = walk.nbytes % AES_BLOCK_SIZE; + +		if (walk.nbytes == len) +			tail = 0; + +		ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, +				   walk.nbytes - tail, ctx->key_enc, +				   num_rounds(ctx), mac, walk.iv); + +		len -= walk.nbytes - tail; +		err = blkcipher_walk_done(&desc, &walk, tail); +	} +	if (!err) +		ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + +	kernel_neon_end(); + +	if (err) +		return err; + +	/* copy authtag to end of dst */ +	scatterwalk_map_and_copy(mac, req->dst, req->cryptlen, +				 crypto_aead_authsize(aead), 1); + +	return 0; +} + +static int ccm_decrypt(struct aead_request *req) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); +	unsigned int authsize = crypto_aead_authsize(aead); +	struct blkcipher_desc desc = { .info = req->iv }; +	struct blkcipher_walk walk; +	u8 __aligned(8) mac[AES_BLOCK_SIZE]; +	u8 buf[AES_BLOCK_SIZE]; +	u32 len = req->cryptlen - authsize; +	int err; + +	err = ccm_init_mac(req, mac, len); +	if (err) +		return err; + +	kernel_neon_begin_partial(6); + +	if 
(req->assoclen) +		ccm_calculate_auth_mac(req, mac); + +	/* preserve the original iv for the final round */ +	memcpy(buf, req->iv, AES_BLOCK_SIZE); + +	blkcipher_walk_init(&walk, req->dst, req->src, len); +	err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, +					     AES_BLOCK_SIZE); + +	while (walk.nbytes) { +		u32 tail = walk.nbytes % AES_BLOCK_SIZE; + +		if (walk.nbytes == len) +			tail = 0; + +		ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, +				   walk.nbytes - tail, ctx->key_enc, +				   num_rounds(ctx), mac, walk.iv); + +		len -= walk.nbytes - tail; +		err = blkcipher_walk_done(&desc, &walk, tail); +	} +	if (!err) +		ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + +	kernel_neon_end(); + +	if (err) +		return err; + +	/* compare calculated auth tag with the stored one */ +	scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize, +				 authsize, 0); + +	if (memcmp(mac, buf, authsize)) +		return -EBADMSG; +	return 0; +} + +static struct crypto_alg ccm_aes_alg = { +	.cra_name		= "ccm(aes)", +	.cra_driver_name	= "ccm-aes-ce", +	.cra_priority		= 300, +	.cra_flags		= CRYPTO_ALG_TYPE_AEAD, +	.cra_blocksize		= 1, +	.cra_ctxsize		= sizeof(struct crypto_aes_ctx), +	.cra_alignmask		= 7, +	.cra_type		= &crypto_aead_type, +	.cra_module		= THIS_MODULE, +	.cra_aead = { +		.ivsize		= AES_BLOCK_SIZE, +		.maxauthsize	= AES_BLOCK_SIZE, +		.setkey		= ccm_setkey, +		.setauthsize	= ccm_setauthsize, +		.encrypt	= ccm_encrypt, +		.decrypt	= ccm_decrypt, +	} +}; + +static int __init aes_mod_init(void) +{ +	if (!(elf_hwcap & HWCAP_AES)) +		return -ENODEV; +	return crypto_register_alg(&ccm_aes_alg); +} + +static void __exit aes_mod_exit(void) +{ +	crypto_unregister_alg(&ccm_aes_alg); +} + +module_init(aes_mod_init); +module_exit(aes_mod_exit); + +MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("ccm(aes)"); diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c new file mode 100644 index 00000000000..2075e1acae6 --- /dev/null +++ b/arch/arm64/crypto/aes-ce-cipher.c @@ -0,0 +1,155 @@ +/* + * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions + * + * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <asm/neon.h> +#include <crypto/aes.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +struct aes_block { +	u8 b[AES_BLOCK_SIZE]; +}; + +static int num_rounds(struct crypto_aes_ctx *ctx) +{ +	/* +	 * # of rounds specified by AES: +	 * 128 bit key		10 rounds +	 * 192 bit key		12 rounds +	 * 256 bit key		14 rounds +	 * => n byte key	=> 6 + (n/4) rounds +	 */ +	return 6 + ctx->key_length / 4; +} + +static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) +{ +	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); +	struct aes_block *out = (struct aes_block *)dst; +	struct aes_block const *in = (struct aes_block *)src; +	void *dummy0; +	int dummy1; + +	kernel_neon_begin_partial(4); + +	__asm__("	ld1	{v0.16b}, %[in]			;" +		"	ld1	{v1.2d}, [%[key]], #16		;" +		"	cmp	%w[rounds], #10			;" +		"	bmi	0f				;" +		"	bne	3f				;" +		"	mov	v3.16b, v1.16b			;" +		"	b	2f				;" +		"0:	mov	v2.16b, v1.16b			;" +		"	ld1	{v3.2d}, [%[key]], #16		;" +		"1:	aese	v0.16b, v2.16b			;" +		"	aesmc	v0.16b, v0.16b			;" +		"2:	ld1	{v1.2d}, [%[key]], #16		;" +		"	aese	v0.16b, v3.16b			;" +		"	aesmc	v0.16b, v0.16b			;" +		"3:	ld1	{v2.2d}, [%[key]], #16		;" +		"	subs	%w[rounds], %w[rounds], #3	;" +		"	aese	v0.16b, v1.16b			;" +		"	aesmc	v0.16b, v0.16b			;" +		"	ld1	{v3.2d}, [%[key]], #16		;" +		"	bpl	1b				;" +		"	aese	v0.16b, v2.16b			;" +		"	eor	v0.16b, v0.16b, v3.16b		;" +		"	st1	{v0.16b}, %[out]		;" + +	:	[out]		"=Q"(*out), +		[key]		"=r"(dummy0), +		[rounds]	"=r"(dummy1) +	:	[in]		"Q"(*in), +				"1"(ctx->key_enc), +				"2"(num_rounds(ctx) - 2) +	:	"cc"); + +	kernel_neon_end(); +} + +static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) +{ +	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); +	struct aes_block *out = (struct aes_block *)dst; +	struct aes_block const *in = (struct aes_block *)src; +	void *dummy0; +	int dummy1; + +	kernel_neon_begin_partial(4); + +	__asm__("	ld1	{v0.16b}, %[in]			;" +		"	ld1	{v1.2d}, [%[key]], #16		;" +		"	cmp	%w[rounds], #10			;" +		"	bmi	0f				;" +		"	bne	3f				;" +		"	mov	v3.16b, v1.16b			;" +		"	b	2f				;" +		"0:	mov	v2.16b, v1.16b			;" +		"	ld1	{v3.2d}, [%[key]], #16		;" +		"1:	aesd	v0.16b, v2.16b			;" +		"	aesimc	v0.16b, v0.16b			;" +		"2:	ld1	{v1.2d}, [%[key]], #16		;" +		"	aesd	v0.16b, v3.16b			;" +		"	aesimc	v0.16b, v0.16b			;" +		"3:	ld1	{v2.2d}, [%[key]], #16		;" +		"	subs	%w[rounds], %w[rounds], #3	;" +		"	aesd	v0.16b, v1.16b			;" +		"	aesimc	v0.16b, v0.16b			;" +		"	ld1	{v3.2d}, [%[key]], #16		;" +		"	bpl	1b				;" +		"	aesd	v0.16b, v2.16b			;" +		"	eor	v0.16b, v0.16b, v3.16b		;" +		"	st1	{v0.16b}, %[out]		;" + +	:	[out]		"=Q"(*out), +		[key]		"=r"(dummy0), +		[rounds]	"=r"(dummy1) +	:	[in]		"Q"(*in), +				"1"(ctx->key_dec), +				"2"(num_rounds(ctx) - 2) +	:	"cc"); + +	kernel_neon_end(); +} + +static struct crypto_alg aes_alg = { +	.cra_name		= "aes", +	.cra_driver_name	= "aes-ce", +	.cra_priority		= 300, +	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER, +	.cra_blocksize		= AES_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct crypto_aes_ctx), +	.cra_module		= THIS_MODULE, +	.cra_cipher = { +		.cia_min_keysize	= AES_MIN_KEY_SIZE, +		.cia_max_keysize	= AES_MAX_KEY_SIZE, +		.cia_setkey		= crypto_aes_set_key, +		.cia_encrypt		= aes_cipher_encrypt, +		.cia_decrypt		= aes_cipher_decrypt +	} +}; + +static int __init aes_mod_init(void) 
+{ +	return crypto_register_alg(&aes_alg); +} + +static void __exit aes_mod_exit(void) +{ +	crypto_unregister_alg(&aes_alg); +} + +module_cpu_feature_match(AES, aes_mod_init); +module_exit(aes_mod_exit); diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S new file mode 100644 index 00000000000..685a18f731e --- /dev/null +++ b/arch/arm64/crypto/aes-ce.S @@ -0,0 +1,133 @@ +/* + * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with + *                                    Crypto Extensions + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + +#define AES_ENTRY(func)		ENTRY(ce_ ## func) +#define AES_ENDPROC(func)	ENDPROC(ce_ ## func) + +	.arch		armv8-a+crypto + +	/* preload all round keys */ +	.macro		load_round_keys, rounds, rk +	cmp		\rounds, #12 +	blo		2222f		/* 128 bits */ +	beq		1111f		/* 192 bits */ +	ld1		{v17.16b-v18.16b}, [\rk], #32 +1111:	ld1		{v19.16b-v20.16b}, [\rk], #32 +2222:	ld1		{v21.16b-v24.16b}, [\rk], #64 +	ld1		{v25.16b-v28.16b}, [\rk], #64 +	ld1		{v29.16b-v31.16b}, [\rk] +	.endm + +	/* prepare for encryption with key in rk[] */ +	.macro		enc_prepare, rounds, rk, ignore +	load_round_keys	\rounds, \rk +	.endm + +	/* prepare for encryption (again) but with new key in rk[] */ +	.macro		enc_switch_key, rounds, rk, ignore +	load_round_keys	\rounds, \rk +	.endm + +	/* prepare for decryption with key in rk[] */ +	.macro		dec_prepare, rounds, rk, ignore +	load_round_keys	\rounds, \rk +	.endm + +	.macro		do_enc_Nx, de, mc, k, i0, i1, i2, i3 +	aes\de		\i0\().16b, \k\().16b +	.ifnb		\i1 +	aes\de		\i1\().16b, \k\().16b +	.ifnb		\i3 +	aes\de		\i2\().16b, \k\().16b +	aes\de		\i3\().16b, \k\().16b +	.endif +	.endif +	aes\mc		\i0\().16b, \i0\().16b +	.ifnb		\i1 +	aes\mc		\i1\().16b, \i1\().16b +	.ifnb		\i3 +	aes\mc		\i2\().16b, \i2\().16b +	aes\mc		\i3\().16b, \i3\().16b +	.endif +	.endif +	.endm + +	/* up to 4 interleaved encryption rounds with the same round key */ +	.macro		round_Nx, enc, k, i0, i1, i2, i3 +	.ifc		\enc, e +	do_enc_Nx	e, mc, \k, \i0, \i1, \i2, \i3 +	.else +	do_enc_Nx	d, imc, \k, \i0, \i1, \i2, \i3 +	.endif +	.endm + +	/* up to 4 interleaved final rounds */ +	.macro		fin_round_Nx, de, k, k2, i0, i1, i2, i3 +	aes\de		\i0\().16b, \k\().16b +	.ifnb		\i1 +	aes\de		\i1\().16b, \k\().16b +	.ifnb		\i3 +	aes\de		\i2\().16b, \k\().16b +	aes\de		\i3\().16b, \k\().16b +	.endif +	.endif +	eor		\i0\().16b, \i0\().16b, \k2\().16b +	.ifnb		\i1 +	eor		\i1\().16b, \i1\().16b, \k2\().16b +	.ifnb		\i3 +	eor		\i2\().16b, \i2\().16b, \k2\().16b +	eor		\i3\().16b, \i3\().16b, \k2\().16b +	.endif +	.endif +	.endm + +	/* up to 4 interleaved blocks */ +	.macro		do_block_Nx, enc, rounds, i0, i1, i2, i3 +	cmp		\rounds, #12 +	blo		2222f		/* 128 bits */ +	beq		1111f		/* 192 bits */ +	round_Nx	\enc, v17, \i0, \i1, \i2, \i3 +	round_Nx	\enc, v18, \i0, \i1, \i2, \i3 +1111:	round_Nx	\enc, v19, \i0, \i1, \i2, \i3 +	round_Nx	\enc, v20, \i0, \i1, \i2, \i3 +2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29 +	round_Nx	\enc, \key, \i0, \i1, \i2, \i3 +	.endr +	fin_round_Nx	\enc, v30, v31, \i0, \i1, \i2, \i3 +	.endm + +	.macro		encrypt_block, in, rounds, t0, t1, t2 +	do_block_Nx	e, \rounds, \in +	.endm + +	.macro		encrypt_block2x, i0, i1, rounds, t0, t1, t2 +	do_block_Nx	e, \rounds, \i0, \i1 +	.endm + +	.macro		encrypt_block4x, i0, i1, i2, 
i3, rounds, t0, t1, t2 +	do_block_Nx	e, \rounds, \i0, \i1, \i2, \i3 +	.endm + +	.macro		decrypt_block, in, rounds, t0, t1, t2 +	do_block_Nx	d, \rounds, \in +	.endm + +	.macro		decrypt_block2x, i0, i1, rounds, t0, t1, t2 +	do_block_Nx	d, \rounds, \i0, \i1 +	.endm + +	.macro		decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 +	do_block_Nx	d, \rounds, \i0, \i1, \i2, \i3 +	.endm + +#include "aes-modes.S" diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c new file mode 100644 index 00000000000..79cd911ef88 --- /dev/null +++ b/arch/arm64/crypto/aes-glue.c @@ -0,0 +1,446 @@ +/* + * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/neon.h> +#include <asm/hwcap.h> +#include <crypto/aes.h> +#include <crypto/ablk_helper.h> +#include <crypto/algapi.h> +#include <linux/module.h> +#include <linux/cpufeature.h> + +#ifdef USE_V8_CRYPTO_EXTENSIONS +#define MODE			"ce" +#define PRIO			300 +#define aes_ecb_encrypt		ce_aes_ecb_encrypt +#define aes_ecb_decrypt		ce_aes_ecb_decrypt +#define aes_cbc_encrypt		ce_aes_cbc_encrypt +#define aes_cbc_decrypt		ce_aes_cbc_decrypt +#define aes_ctr_encrypt		ce_aes_ctr_encrypt +#define aes_xts_encrypt		ce_aes_xts_encrypt +#define aes_xts_decrypt		ce_aes_xts_decrypt +MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); +#else +#define MODE			"neon" +#define PRIO			200 +#define aes_ecb_encrypt		neon_aes_ecb_encrypt +#define aes_ecb_decrypt		neon_aes_ecb_decrypt +#define aes_cbc_encrypt		neon_aes_cbc_encrypt +#define aes_cbc_decrypt		neon_aes_cbc_decrypt +#define aes_ctr_encrypt		neon_aes_ctr_encrypt +#define aes_xts_encrypt		neon_aes_xts_encrypt +#define aes_xts_decrypt		neon_aes_xts_decrypt +MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON"); +MODULE_ALIAS("ecb(aes)"); +MODULE_ALIAS("cbc(aes)"); +MODULE_ALIAS("ctr(aes)"); +MODULE_ALIAS("xts(aes)"); +#endif + +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +/* defined in aes-modes.S */ +asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], +				int rounds, int blocks, int first); +asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], +				int rounds, int blocks, int first); + +asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], +				int rounds, int blocks, u8 iv[], int first); +asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], +				int rounds, int blocks, u8 iv[], int first); + +asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], +				int rounds, int blocks, u8 ctr[], int first); + +asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], +				int rounds, int blocks, u8 const rk2[], u8 iv[], +				int first); +asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], +				int rounds, int blocks, u8 const rk2[], u8 iv[], +				int first); + +struct crypto_aes_xts_ctx { +	struct crypto_aes_ctx key1; +	struct crypto_aes_ctx __aligned(8) key2; +}; + +static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, +		       unsigned int key_len) +{ +	struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm); +	int ret; + +	ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2); +	if (!ret) +		ret = 
crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2], +					    key_len / 2); +	if (!ret) +		return 0; + +	tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; +	return -EINVAL; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	int err, first, rounds = 6 + ctx->key_length / 4; +	struct blkcipher_walk walk; +	unsigned int blocks; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	blkcipher_walk_init(&walk, dst, src, nbytes); +	err = blkcipher_walk_virt(desc, &walk); + +	kernel_neon_begin(); +	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { +		aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr, +				(u8 *)ctx->key_enc, rounds, blocks, first); +		err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); +	} +	kernel_neon_end(); +	return err; +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	int err, first, rounds = 6 + ctx->key_length / 4; +	struct blkcipher_walk walk; +	unsigned int blocks; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	blkcipher_walk_init(&walk, dst, src, nbytes); +	err = blkcipher_walk_virt(desc, &walk); + +	kernel_neon_begin(); +	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { +		aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr, +				(u8 *)ctx->key_dec, rounds, blocks, first); +		err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); +	} +	kernel_neon_end(); +	return err; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	int err, first, rounds = 6 + ctx->key_length / 4; +	struct blkcipher_walk walk; +	unsigned int blocks; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	blkcipher_walk_init(&walk, dst, src, nbytes); +	err = blkcipher_walk_virt(desc, &walk); + +	kernel_neon_begin(); +	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { +		aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, +				(u8 *)ctx->key_enc, rounds, blocks, walk.iv, +				first); +		err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); +	} +	kernel_neon_end(); +	return err; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	int err, first, rounds = 6 + ctx->key_length / 4; +	struct blkcipher_walk walk; +	unsigned int blocks; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	blkcipher_walk_init(&walk, dst, src, nbytes); +	err = blkcipher_walk_virt(desc, &walk); + +	kernel_neon_begin(); +	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { +		aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, +				(u8 *)ctx->key_dec, rounds, blocks, walk.iv, +				first); +		err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); +	} +	kernel_neon_end(); +	return err; +} + +static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	int err, first, rounds = 6 + ctx->key_length / 4; +	struct blkcipher_walk walk; +	
int blocks; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	blkcipher_walk_init(&walk, dst, src, nbytes); +	err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); + +	first = 1; +	kernel_neon_begin(); +	while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { +		aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, +				(u8 *)ctx->key_enc, rounds, blocks, walk.iv, +				first); +		first = 0; +		nbytes -= blocks * AES_BLOCK_SIZE; +		if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE) +			break; +		err = blkcipher_walk_done(desc, &walk, +					  walk.nbytes % AES_BLOCK_SIZE); +	} +	if (nbytes) { +		u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; +		u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; +		u8 __aligned(8) tail[AES_BLOCK_SIZE]; + +		/* +		 * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need +		 * to tell aes_ctr_encrypt() to only read half a block. +		 */ +		blocks = (nbytes <= 8) ? -1 : 1; + +		aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds, +				blocks, walk.iv, first); +		memcpy(tdst, tail, nbytes); +		err = blkcipher_walk_done(desc, &walk, 0); +	} +	kernel_neon_end(); + +	return err; +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	int err, first, rounds = 6 + ctx->key1.key_length / 4; +	struct blkcipher_walk walk; +	unsigned int blocks; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	blkcipher_walk_init(&walk, dst, src, nbytes); +	err = blkcipher_walk_virt(desc, &walk); + +	kernel_neon_begin(); +	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { +		aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, +				(u8 *)ctx->key1.key_enc, rounds, blocks, +				(u8 *)ctx->key2.key_enc, walk.iv, first); +		err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); +	} +	kernel_neon_end(); + +	return err; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	int err, first, rounds = 6 + ctx->key1.key_length / 4; +	struct blkcipher_walk walk; +	unsigned int blocks; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	blkcipher_walk_init(&walk, dst, src, nbytes); +	err = blkcipher_walk_virt(desc, &walk); + +	kernel_neon_begin(); +	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { +		aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, +				(u8 *)ctx->key1.key_dec, rounds, blocks, +				(u8 *)ctx->key2.key_enc, walk.iv, first); +		err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); +	} +	kernel_neon_end(); + +	return err; +} + +static struct crypto_alg aes_algs[] = { { +	.cra_name		= "__ecb-aes-" MODE, +	.cra_driver_name	= "__driver-ecb-aes-" MODE, +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= AES_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct crypto_aes_ctx), +	.cra_alignmask		= 7, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_blkcipher = { +		.min_keysize	= AES_MIN_KEY_SIZE, +		.max_keysize	= AES_MAX_KEY_SIZE, +		.ivsize		= AES_BLOCK_SIZE, +		.setkey		= crypto_aes_set_key, +		.encrypt	= ecb_encrypt, +		.decrypt	= ecb_decrypt, +	}, +}, { +	.cra_name		= "__cbc-aes-" MODE, +	.cra_driver_name	= "__driver-cbc-aes-" MODE, +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= 
AES_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct crypto_aes_ctx), +	.cra_alignmask		= 7, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_blkcipher = { +		.min_keysize	= AES_MIN_KEY_SIZE, +		.max_keysize	= AES_MAX_KEY_SIZE, +		.ivsize		= AES_BLOCK_SIZE, +		.setkey		= crypto_aes_set_key, +		.encrypt	= cbc_encrypt, +		.decrypt	= cbc_decrypt, +	}, +}, { +	.cra_name		= "__ctr-aes-" MODE, +	.cra_driver_name	= "__driver-ctr-aes-" MODE, +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= 1, +	.cra_ctxsize		= sizeof(struct crypto_aes_ctx), +	.cra_alignmask		= 7, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_blkcipher = { +		.min_keysize	= AES_MIN_KEY_SIZE, +		.max_keysize	= AES_MAX_KEY_SIZE, +		.ivsize		= AES_BLOCK_SIZE, +		.setkey		= crypto_aes_set_key, +		.encrypt	= ctr_encrypt, +		.decrypt	= ctr_encrypt, +	}, +}, { +	.cra_name		= "__xts-aes-" MODE, +	.cra_driver_name	= "__driver-xts-aes-" MODE, +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= AES_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct crypto_aes_xts_ctx), +	.cra_alignmask		= 7, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_blkcipher = { +		.min_keysize	= 2 * AES_MIN_KEY_SIZE, +		.max_keysize	= 2 * AES_MAX_KEY_SIZE, +		.ivsize		= AES_BLOCK_SIZE, +		.setkey		= xts_set_key, +		.encrypt	= xts_encrypt, +		.decrypt	= xts_decrypt, +	}, +}, { +	.cra_name		= "ecb(aes)", +	.cra_driver_name	= "ecb-aes-" MODE, +	.cra_priority		= PRIO, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, +	.cra_blocksize		= AES_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 7, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_ablkcipher = { +		.min_keysize	= AES_MIN_KEY_SIZE, +		.max_keysize	= AES_MAX_KEY_SIZE, +		.ivsize		= AES_BLOCK_SIZE, +		.setkey		= ablk_set_key, +		.encrypt	= ablk_encrypt, +		.decrypt	= ablk_decrypt, +	} +}, { +	.cra_name		= "cbc(aes)", +	.cra_driver_name	= "cbc-aes-" MODE, +	.cra_priority		= PRIO, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, +	.cra_blocksize		= AES_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 7, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_ablkcipher = { +		.min_keysize	= AES_MIN_KEY_SIZE, +		.max_keysize	= AES_MAX_KEY_SIZE, +		.ivsize		= AES_BLOCK_SIZE, +		.setkey		= ablk_set_key, +		.encrypt	= ablk_encrypt, +		.decrypt	= ablk_decrypt, +	} +}, { +	.cra_name		= "ctr(aes)", +	.cra_driver_name	= "ctr-aes-" MODE, +	.cra_priority		= PRIO, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, +	.cra_blocksize		= 1, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 7, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_ablkcipher = { +		.min_keysize	= AES_MIN_KEY_SIZE, +		.max_keysize	= AES_MAX_KEY_SIZE, +		.ivsize		= AES_BLOCK_SIZE, +		.setkey		= ablk_set_key, +		.encrypt	= ablk_encrypt, +		.decrypt	= ablk_decrypt, +	} +}, { +	.cra_name		= "xts(aes)", +	.cra_driver_name	= "xts-aes-" MODE, +	.cra_priority		= PRIO, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, +	.cra_blocksize		= AES_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 7, +	.cra_type		= &crypto_ablkcipher_type, +	
.cra_module		= THIS_MODULE, +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_ablkcipher = { +		.min_keysize	= 2 * AES_MIN_KEY_SIZE, +		.max_keysize	= 2 * AES_MAX_KEY_SIZE, +		.ivsize		= AES_BLOCK_SIZE, +		.setkey		= ablk_set_key, +		.encrypt	= ablk_encrypt, +		.decrypt	= ablk_decrypt, +	} +} }; + +static int __init aes_init(void) +{ +	return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs)); +} + +static void __exit aes_exit(void) +{ +	crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs)); +} + +#ifdef USE_V8_CRYPTO_EXTENSIONS +module_cpu_feature_match(AES, aes_init); +#else +module_init(aes_init); +#endif +module_exit(aes_exit); diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S new file mode 100644 index 00000000000..f6e372c528e --- /dev/null +++ b/arch/arm64/crypto/aes-modes.S @@ -0,0 +1,532 @@ +/* + * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* included by aes-ce.S and aes-neon.S */ + +	.text +	.align		4 + +/* + * There are several ways to instantiate this code: + * - no interleave, all inline + * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2) + * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE) + * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4) + * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE) + * + * Macros imported by this code: + * - enc_prepare	- setup NEON registers for encryption + * - dec_prepare	- setup NEON registers for decryption + * - enc_switch_key	- change to new key after having prepared for encryption + * - encrypt_block	- encrypt a single block + * - decrypt block	- decrypt a single block + * - encrypt_block2x	- encrypt 2 blocks in parallel (if INTERLEAVE == 2) + * - decrypt_block2x	- decrypt 2 blocks in parallel (if INTERLEAVE == 2) + * - encrypt_block4x	- encrypt 4 blocks in parallel (if INTERLEAVE == 4) + * - decrypt_block4x	- decrypt 4 blocks in parallel (if INTERLEAVE == 4) + */ + +#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE) +#define FRAME_PUSH	stp x29, x30, [sp,#-16]! 
; mov x29, sp +#define FRAME_POP	ldp x29, x30, [sp],#16 + +#if INTERLEAVE == 2 + +aes_encrypt_block2x: +	encrypt_block2x	v0, v1, w3, x2, x6, w7 +	ret +ENDPROC(aes_encrypt_block2x) + +aes_decrypt_block2x: +	decrypt_block2x	v0, v1, w3, x2, x6, w7 +	ret +ENDPROC(aes_decrypt_block2x) + +#elif INTERLEAVE == 4 + +aes_encrypt_block4x: +	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7 +	ret +ENDPROC(aes_encrypt_block4x) + +aes_decrypt_block4x: +	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7 +	ret +ENDPROC(aes_decrypt_block4x) + +#else +#error INTERLEAVE should equal 2 or 4 +#endif + +	.macro		do_encrypt_block2x +	bl		aes_encrypt_block2x +	.endm + +	.macro		do_decrypt_block2x +	bl		aes_decrypt_block2x +	.endm + +	.macro		do_encrypt_block4x +	bl		aes_encrypt_block4x +	.endm + +	.macro		do_decrypt_block4x +	bl		aes_decrypt_block4x +	.endm + +#else +#define FRAME_PUSH +#define FRAME_POP + +	.macro		do_encrypt_block2x +	encrypt_block2x	v0, v1, w3, x2, x6, w7 +	.endm + +	.macro		do_decrypt_block2x +	decrypt_block2x	v0, v1, w3, x2, x6, w7 +	.endm + +	.macro		do_encrypt_block4x +	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7 +	.endm + +	.macro		do_decrypt_block4x +	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7 +	.endm + +#endif + +	/* +	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, +	 *		   int blocks, int first) +	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, +	 *		   int blocks, int first) +	 */ + +AES_ENTRY(aes_ecb_encrypt) +	FRAME_PUSH +	cbz		w5, .LecbencloopNx + +	enc_prepare	w3, x2, x5 + +.LecbencloopNx: +#if INTERLEAVE >= 2 +	subs		w4, w4, #INTERLEAVE +	bmi		.Lecbenc1x +#if INTERLEAVE == 2 +	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */ +	do_encrypt_block2x +	st1		{v0.16b-v1.16b}, [x0], #32 +#else +	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */ +	do_encrypt_block4x +	st1		{v0.16b-v3.16b}, [x0], #64 +#endif +	b		.LecbencloopNx +.Lecbenc1x: +	adds		w4, w4, #INTERLEAVE +	beq		.Lecbencout +#endif +.Lecbencloop: +	ld1		{v0.16b}, [x1], #16		/* get next pt block */ +	encrypt_block	v0, w3, x2, x5, w6 +	st1		{v0.16b}, [x0], #16 +	subs		w4, w4, #1 +	bne		.Lecbencloop +.Lecbencout: +	FRAME_POP +	ret +AES_ENDPROC(aes_ecb_encrypt) + + +AES_ENTRY(aes_ecb_decrypt) +	FRAME_PUSH +	cbz		w5, .LecbdecloopNx + +	dec_prepare	w3, x2, x5 + +.LecbdecloopNx: +#if INTERLEAVE >= 2 +	subs		w4, w4, #INTERLEAVE +	bmi		.Lecbdec1x +#if INTERLEAVE == 2 +	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */ +	do_decrypt_block2x +	st1		{v0.16b-v1.16b}, [x0], #32 +#else +	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */ +	do_decrypt_block4x +	st1		{v0.16b-v3.16b}, [x0], #64 +#endif +	b		.LecbdecloopNx +.Lecbdec1x: +	adds		w4, w4, #INTERLEAVE +	beq		.Lecbdecout +#endif +.Lecbdecloop: +	ld1		{v0.16b}, [x1], #16		/* get next ct block */ +	decrypt_block	v0, w3, x2, x5, w6 +	st1		{v0.16b}, [x0], #16 +	subs		w4, w4, #1 +	bne		.Lecbdecloop +.Lecbdecout: +	FRAME_POP +	ret +AES_ENDPROC(aes_ecb_decrypt) + + +	/* +	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, +	 *		   int blocks, u8 iv[], int first) +	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, +	 *		   int blocks, u8 iv[], int first) +	 */ + +AES_ENTRY(aes_cbc_encrypt) +	cbz		w6, .Lcbcencloop + +	ld1		{v0.16b}, [x5]			/* get iv */ +	enc_prepare	w3, x2, x5 + +.Lcbcencloop: +	ld1		{v1.16b}, [x1], #16		/* get next pt block */ +	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with iv */ +	encrypt_block	v0, w3, x2, x5, w6 +	st1		{v0.16b}, [x0], #16 +	subs		w4, w4, #1 +	bne		
.Lcbcencloop +	ret +AES_ENDPROC(aes_cbc_encrypt) + + +AES_ENTRY(aes_cbc_decrypt) +	FRAME_PUSH +	cbz		w6, .LcbcdecloopNx + +	ld1		{v7.16b}, [x5]			/* get iv */ +	dec_prepare	w3, x2, x5 + +.LcbcdecloopNx: +#if INTERLEAVE >= 2 +	subs		w4, w4, #INTERLEAVE +	bmi		.Lcbcdec1x +#if INTERLEAVE == 2 +	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */ +	mov		v2.16b, v0.16b +	mov		v3.16b, v1.16b +	do_decrypt_block2x +	eor		v0.16b, v0.16b, v7.16b +	eor		v1.16b, v1.16b, v2.16b +	mov		v7.16b, v3.16b +	st1		{v0.16b-v1.16b}, [x0], #32 +#else +	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */ +	mov		v4.16b, v0.16b +	mov		v5.16b, v1.16b +	mov		v6.16b, v2.16b +	do_decrypt_block4x +	sub		x1, x1, #16 +	eor		v0.16b, v0.16b, v7.16b +	eor		v1.16b, v1.16b, v4.16b +	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */ +	eor		v2.16b, v2.16b, v5.16b +	eor		v3.16b, v3.16b, v6.16b +	st1		{v0.16b-v3.16b}, [x0], #64 +#endif +	b		.LcbcdecloopNx +.Lcbcdec1x: +	adds		w4, w4, #INTERLEAVE +	beq		.Lcbcdecout +#endif +.Lcbcdecloop: +	ld1		{v1.16b}, [x1], #16		/* get next ct block */ +	mov		v0.16b, v1.16b			/* ...and copy to v0 */ +	decrypt_block	v0, w3, x2, x5, w6 +	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */ +	mov		v7.16b, v1.16b			/* ct is next iv */ +	st1		{v0.16b}, [x0], #16 +	subs		w4, w4, #1 +	bne		.Lcbcdecloop +.Lcbcdecout: +	FRAME_POP +	ret +AES_ENDPROC(aes_cbc_decrypt) + + +	/* +	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, +	 *		   int blocks, u8 ctr[], int first) +	 */ + +AES_ENTRY(aes_ctr_encrypt) +	FRAME_PUSH +	cbnz		w6, .Lctrfirst		/* 1st time around? */ +	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */ +	rev		x5, x5 +#if INTERLEAVE >= 2 +	cmn		w5, w4			/* 32 bit overflow? */ +	bcs		.Lctrinc +	add		x5, x5, #1		/* increment BE ctr */ +	b		.LctrincNx +#else +	b		.Lctrinc +#endif +.Lctrfirst: +	enc_prepare	w3, x2, x6 +	ld1		{v4.16b}, [x5] +	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */ +	rev		x5, x5 +#if INTERLEAVE >= 2 +	cmn		w5, w4			/* 32 bit overflow? 
*/ +	bcs		.Lctrloop +.LctrloopNx: +	subs		w4, w4, #INTERLEAVE +	bmi		.Lctr1x +#if INTERLEAVE == 2 +	mov		v0.8b, v4.8b +	mov		v1.8b, v4.8b +	rev		x7, x5 +	add		x5, x5, #1 +	ins		v0.d[1], x7 +	rev		x7, x5 +	add		x5, x5, #1 +	ins		v1.d[1], x7 +	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */ +	do_encrypt_block2x +	eor		v0.16b, v0.16b, v2.16b +	eor		v1.16b, v1.16b, v3.16b +	st1		{v0.16b-v1.16b}, [x0], #32 +#else +	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */ +	dup		v7.4s, w5 +	mov		v0.16b, v4.16b +	add		v7.4s, v7.4s, v8.4s +	mov		v1.16b, v4.16b +	rev32		v8.16b, v7.16b +	mov		v2.16b, v4.16b +	mov		v3.16b, v4.16b +	mov		v1.s[3], v8.s[0] +	mov		v2.s[3], v8.s[1] +	mov		v3.s[3], v8.s[2] +	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */ +	do_encrypt_block4x +	eor		v0.16b, v5.16b, v0.16b +	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */ +	eor		v1.16b, v6.16b, v1.16b +	eor		v2.16b, v7.16b, v2.16b +	eor		v3.16b, v5.16b, v3.16b +	st1		{v0.16b-v3.16b}, [x0], #64 +	add		x5, x5, #INTERLEAVE +#endif +	cbz		w4, .LctroutNx +.LctrincNx: +	rev		x7, x5 +	ins		v4.d[1], x7 +	b		.LctrloopNx +.LctroutNx: +	sub		x5, x5, #1 +	rev		x7, x5 +	ins		v4.d[1], x7 +	b		.Lctrout +.Lctr1x: +	adds		w4, w4, #INTERLEAVE +	beq		.Lctrout +#endif +.Lctrloop: +	mov		v0.16b, v4.16b +	encrypt_block	v0, w3, x2, x6, w7 +	subs		w4, w4, #1 +	bmi		.Lctrhalfblock		/* blocks < 0 means 1/2 block */ +	ld1		{v3.16b}, [x1], #16 +	eor		v3.16b, v0.16b, v3.16b +	st1		{v3.16b}, [x0], #16 +	beq		.Lctrout +.Lctrinc: +	adds		x5, x5, #1		/* increment BE ctr */ +	rev		x7, x5 +	ins		v4.d[1], x7 +	bcc		.Lctrloop		/* no overflow? */ +	umov		x7, v4.d[0]		/* load upper word of ctr  */ +	rev		x7, x7			/* ... to handle the carry */ +	add		x7, x7, #1 +	rev		x7, x7 +	ins		v4.d[0], x7 +	b		.Lctrloop +.Lctrhalfblock: +	ld1		{v3.8b}, [x1] +	eor		v3.8b, v0.8b, v3.8b +	st1		{v3.8b}, [x0] +.Lctrout: +	FRAME_POP +	ret +AES_ENDPROC(aes_ctr_encrypt) +	.ltorg + + +	/* +	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, +	 *		   int blocks, u8 const rk2[], u8 iv[], int first) +	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, +	 *		   int blocks, u8 const rk2[], u8 iv[], int first) +	 */ + +	.macro		next_tweak, out, in, const, tmp +	sshr		\tmp\().2d,  \in\().2d,   #63 +	and		\tmp\().16b, \tmp\().16b, \const\().16b +	add		\out\().2d,  \in\().2d,   \in\().2d +	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8 +	eor		\out\().16b, \out\().16b, \tmp\().16b +	.endm + +.Lxts_mul_x: +	.word		1, 0, 0x87, 0 + +AES_ENTRY(aes_xts_encrypt) +	FRAME_PUSH +	cbz		w7, .LxtsencloopNx + +	ld1		{v4.16b}, [x6] +	enc_prepare	w3, x5, x6 +	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */ +	enc_switch_key	w3, x2, x6 +	ldr		q7, .Lxts_mul_x +	b		.LxtsencNx + +.LxtsencloopNx: +	ldr		q7, .Lxts_mul_x +	next_tweak	v4, v4, v7, v8 +.LxtsencNx: +#if INTERLEAVE >= 2 +	subs		w4, w4, #INTERLEAVE +	bmi		.Lxtsenc1x +#if INTERLEAVE == 2 +	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */ +	next_tweak	v5, v4, v7, v8 +	eor		v0.16b, v0.16b, v4.16b +	eor		v1.16b, v1.16b, v5.16b +	do_encrypt_block2x +	eor		v0.16b, v0.16b, v4.16b +	eor		v1.16b, v1.16b, v5.16b +	st1		{v0.16b-v1.16b}, [x0], #32 +	cbz		w4, .LxtsencoutNx +	next_tweak	v4, v5, v7, v8 +	b		.LxtsencNx +.LxtsencoutNx: +	mov		v4.16b, v5.16b +	b		.Lxtsencout +#else +	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */ +	next_tweak	v5, v4, v7, v8 +	eor		v0.16b, v0.16b, v4.16b +	next_tweak	v6, v5, v7, v8 +	eor		v1.16b, v1.16b, v5.16b +	eor		v2.16b, v2.16b, v6.16b +	next_tweak	v7, v6, v7, 
v8 +	eor		v3.16b, v3.16b, v7.16b +	do_encrypt_block4x +	eor		v3.16b, v3.16b, v7.16b +	eor		v0.16b, v0.16b, v4.16b +	eor		v1.16b, v1.16b, v5.16b +	eor		v2.16b, v2.16b, v6.16b +	st1		{v0.16b-v3.16b}, [x0], #64 +	mov		v4.16b, v7.16b +	cbz		w4, .Lxtsencout +	b		.LxtsencloopNx +#endif +.Lxtsenc1x: +	adds		w4, w4, #INTERLEAVE +	beq		.Lxtsencout +#endif +.Lxtsencloop: +	ld1		{v1.16b}, [x1], #16 +	eor		v0.16b, v1.16b, v4.16b +	encrypt_block	v0, w3, x2, x6, w7 +	eor		v0.16b, v0.16b, v4.16b +	st1		{v0.16b}, [x0], #16 +	subs		w4, w4, #1 +	beq		.Lxtsencout +	next_tweak	v4, v4, v7, v8 +	b		.Lxtsencloop +.Lxtsencout: +	FRAME_POP +	ret +AES_ENDPROC(aes_xts_encrypt) + + +AES_ENTRY(aes_xts_decrypt) +	FRAME_PUSH +	cbz		w7, .LxtsdecloopNx + +	ld1		{v4.16b}, [x6] +	enc_prepare	w3, x5, x6 +	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */ +	dec_prepare	w3, x2, x6 +	ldr		q7, .Lxts_mul_x +	b		.LxtsdecNx + +.LxtsdecloopNx: +	ldr		q7, .Lxts_mul_x +	next_tweak	v4, v4, v7, v8 +.LxtsdecNx: +#if INTERLEAVE >= 2 +	subs		w4, w4, #INTERLEAVE +	bmi		.Lxtsdec1x +#if INTERLEAVE == 2 +	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */ +	next_tweak	v5, v4, v7, v8 +	eor		v0.16b, v0.16b, v4.16b +	eor		v1.16b, v1.16b, v5.16b +	do_decrypt_block2x +	eor		v0.16b, v0.16b, v4.16b +	eor		v1.16b, v1.16b, v5.16b +	st1		{v0.16b-v1.16b}, [x0], #32 +	cbz		w4, .LxtsdecoutNx +	next_tweak	v4, v5, v7, v8 +	b		.LxtsdecNx +.LxtsdecoutNx: +	mov		v4.16b, v5.16b +	b		.Lxtsdecout +#else +	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */ +	next_tweak	v5, v4, v7, v8 +	eor		v0.16b, v0.16b, v4.16b +	next_tweak	v6, v5, v7, v8 +	eor		v1.16b, v1.16b, v5.16b +	eor		v2.16b, v2.16b, v6.16b +	next_tweak	v7, v6, v7, v8 +	eor		v3.16b, v3.16b, v7.16b +	do_decrypt_block4x +	eor		v3.16b, v3.16b, v7.16b +	eor		v0.16b, v0.16b, v4.16b +	eor		v1.16b, v1.16b, v5.16b +	eor		v2.16b, v2.16b, v6.16b +	st1		{v0.16b-v3.16b}, [x0], #64 +	mov		v4.16b, v7.16b +	cbz		w4, .Lxtsdecout +	b		.LxtsdecloopNx +#endif +.Lxtsdec1x: +	adds		w4, w4, #INTERLEAVE +	beq		.Lxtsdecout +#endif +.Lxtsdecloop: +	ld1		{v1.16b}, [x1], #16 +	eor		v0.16b, v1.16b, v4.16b +	decrypt_block	v0, w3, x2, x6, w7 +	eor		v0.16b, v0.16b, v4.16b +	st1		{v0.16b}, [x0], #16 +	subs		w4, w4, #1 +	beq		.Lxtsdecout +	next_tweak	v4, v4, v7, v8 +	b		.Lxtsdecloop +.Lxtsdecout: +	FRAME_POP +	ret +AES_ENDPROC(aes_xts_decrypt) diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S new file mode 100644 index 00000000000..b93170e1cc9 --- /dev/null +++ b/arch/arm64/crypto/aes-neon.S @@ -0,0 +1,382 @@ +/* + * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/linkage.h> + +#define AES_ENTRY(func)		ENTRY(neon_ ## func) +#define AES_ENDPROC(func)	ENDPROC(neon_ ## func) + +	/* multiply by polynomial 'x' in GF(2^8) */ +	.macro		mul_by_x, out, in, temp, const +	sshr		\temp, \in, #7 +	add		\out, \in, \in +	and		\temp, \temp, \const +	eor		\out, \out, \temp +	.endm + +	/* preload the entire Sbox */ +	.macro		prepare, sbox, shiftrows, temp +	adr		\temp, \sbox +	movi		v12.16b, #0x40 +	ldr		q13, \shiftrows +	movi		v14.16b, #0x1b +	ld1		{v16.16b-v19.16b}, [\temp], #64 +	ld1		{v20.16b-v23.16b}, [\temp], #64 +	ld1		{v24.16b-v27.16b}, [\temp], #64 +	ld1		{v28.16b-v31.16b}, [\temp] +	.endm + +	/* do preload for encryption */ +	.macro		enc_prepare, ignore0, ignore1, temp +	prepare		.LForward_Sbox, .LForward_ShiftRows, \temp +	.endm + +	.macro		enc_switch_key, ignore0, ignore1, temp +	/* do nothing */ +	.endm + +	/* do preload for decryption */ +	.macro		dec_prepare, ignore0, ignore1, temp +	prepare		.LReverse_Sbox, .LReverse_ShiftRows, \temp +	.endm + +	/* apply SubBytes transformation using the the preloaded Sbox */ +	.macro		sub_bytes, in +	sub		v9.16b, \in\().16b, v12.16b +	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b +	sub		v10.16b, v9.16b, v12.16b +	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b +	sub		v11.16b, v10.16b, v12.16b +	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b +	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b +	.endm + +	/* apply MixColumns transformation */ +	.macro		mix_columns, in +	mul_by_x	v10.16b, \in\().16b, v9.16b, v14.16b +	rev32		v8.8h, \in\().8h +	eor		\in\().16b, v10.16b, \in\().16b +	shl		v9.4s, v8.4s, #24 +	shl		v11.4s, \in\().4s, #24 +	sri		v9.4s, v8.4s, #8 +	sri		v11.4s, \in\().4s, #8 +	eor		v9.16b, v9.16b, v8.16b +	eor		v10.16b, v10.16b, v9.16b +	eor		\in\().16b, v10.16b, v11.16b +	.endm + +	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ +	.macro		inv_mix_columns, in +	mul_by_x	v11.16b, \in\().16b, v10.16b, v14.16b +	mul_by_x	v11.16b, v11.16b, v10.16b, v14.16b +	eor		\in\().16b, \in\().16b, v11.16b +	rev32		v11.8h, v11.8h +	eor		\in\().16b, \in\().16b, v11.16b +	mix_columns	\in +	.endm + +	.macro		do_block, enc, in, rounds, rk, rkp, i +	ld1		{v15.16b}, [\rk] +	add		\rkp, \rk, #16 +	mov		\i, \rounds +1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */ +	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */ +	sub_bytes	\in +	ld1		{v15.16b}, [\rkp], #16 +	subs		\i, \i, #1 +	beq		2222f +	.if		\enc == 1 +	mix_columns	\in +	.else +	inv_mix_columns	\in +	.endif +	b		1111b +2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */ +	.endm + +	.macro		encrypt_block, in, rounds, rk, rkp, i +	do_block	1, \in, \rounds, \rk, \rkp, \i +	.endm + +	.macro		decrypt_block, in, rounds, rk, rkp, i +	do_block	0, \in, \rounds, \rk, \rkp, \i +	.endm + +	/* +	 * Interleaved versions: functionally equivalent to the +	 * ones above, but applied to 2 or 4 AES states in parallel. 
+	 */ + +	.macro		sub_bytes_2x, in0, in1 +	sub		v8.16b, \in0\().16b, v12.16b +	sub		v9.16b, \in1\().16b, v12.16b +	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b +	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b +	sub		v10.16b, v8.16b, v12.16b +	sub		v11.16b, v9.16b, v12.16b +	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b +	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b +	sub		v8.16b, v10.16b, v12.16b +	sub		v9.16b, v11.16b, v12.16b +	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b +	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b +	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b +	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b +	.endm + +	.macro		sub_bytes_4x, in0, in1, in2, in3 +	sub		v8.16b, \in0\().16b, v12.16b +	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b +	sub		v9.16b, \in1\().16b, v12.16b +	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b +	sub		v10.16b, \in2\().16b, v12.16b +	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b +	sub		v11.16b, \in3\().16b, v12.16b +	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b +	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b +	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b +	sub		v8.16b, v8.16b, v12.16b +	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b +	sub		v9.16b, v9.16b, v12.16b +	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b +	sub		v10.16b, v10.16b, v12.16b +	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b +	sub		v11.16b, v11.16b, v12.16b +	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b +	sub		v8.16b, v8.16b, v12.16b +	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b +	sub		v9.16b, v9.16b, v12.16b +	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b +	sub		v10.16b, v10.16b, v12.16b +	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b +	sub		v11.16b, v11.16b, v12.16b +	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b +	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b +	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b +	.endm + +	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const +	sshr		\tmp0\().16b, \in0\().16b,  #7 +	add		\out0\().16b, \in0\().16b,  \in0\().16b +	sshr		\tmp1\().16b, \in1\().16b,  #7 +	and		\tmp0\().16b, \tmp0\().16b, \const\().16b +	add		\out1\().16b, \in1\().16b,  \in1\().16b +	and		\tmp1\().16b, \tmp1\().16b, \const\().16b +	eor		\out0\().16b, \out0\().16b, \tmp0\().16b +	eor		\out1\().16b, \out1\().16b, \tmp1\().16b +	.endm + +	.macro		mix_columns_2x, in0, in1 +	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14 +	rev32		v10.8h, \in0\().8h +	rev32		v11.8h, \in1\().8h +	eor		\in0\().16b, v8.16b, \in0\().16b +	eor		\in1\().16b, v9.16b, \in1\().16b +	shl		v12.4s, v10.4s, #24 +	shl		v13.4s, v11.4s, #24 +	eor		v8.16b, v8.16b, v10.16b +	sri		v12.4s, v10.4s, #8 +	shl		v10.4s, \in0\().4s, #24 +	eor		v9.16b, v9.16b, v11.16b +	sri		v13.4s, v11.4s, #8 +	shl		v11.4s, \in1\().4s, #24 +	sri		v10.4s, \in0\().4s, #8 +	eor		\in0\().16b, v8.16b, v12.16b +	sri		v11.4s, \in1\().4s, #8 +	eor		\in1\().16b, v9.16b, v13.16b +	eor		\in0\().16b, v10.16b, \in0\().16b +	eor		\in1\().16b, v11.16b, \in1\().16b +	.endm + +	.macro		inv_mix_cols_2x, in0, in1 +	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14 +	mul_by_x_2x	v8, v9, v8, v9, v10, v11, v14 +	eor		\in0\().16b, \in0\().16b, v8.16b +	eor		\in1\().16b, \in1\().16b, v9.16b +	rev32		v8.8h, v8.8h +	rev32		v9.8h, v9.8h +	eor		\in0\().16b, \in0\().16b, v8.16b +	eor		\in1\().16b, \in1\().16b, v9.16b +	mix_columns_2x	\in0, \in1 +	.endm + +	.macro		inv_mix_cols_4x, in0, in1, in2, in3 +	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14 +	mul_by_x_2x	v10, v11, \in2, \in3, v12, v13, v14 +	mul_by_x_2x	v8, v9, v8, v9, v12, v13, v14 +	mul_by_x_2x	v10, v11, v10, v11, 
v12, v13, v14 +	eor		\in0\().16b, \in0\().16b, v8.16b +	eor		\in1\().16b, \in1\().16b, v9.16b +	eor		\in2\().16b, \in2\().16b, v10.16b +	eor		\in3\().16b, \in3\().16b, v11.16b +	rev32		v8.8h, v8.8h +	rev32		v9.8h, v9.8h +	rev32		v10.8h, v10.8h +	rev32		v11.8h, v11.8h +	eor		\in0\().16b, \in0\().16b, v8.16b +	eor		\in1\().16b, \in1\().16b, v9.16b +	eor		\in2\().16b, \in2\().16b, v10.16b +	eor		\in3\().16b, \in3\().16b, v11.16b +	mix_columns_2x	\in0, \in1 +	mix_columns_2x	\in2, \in3 +	.endm + +	.macro		do_block_2x, enc, in0, in1 rounds, rk, rkp, i +	ld1		{v15.16b}, [\rk] +	add		\rkp, \rk, #16 +	mov		\i, \rounds +1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */ +	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */ +	sub_bytes_2x	\in0, \in1 +	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */ +	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */ +	ld1		{v15.16b}, [\rkp], #16 +	subs		\i, \i, #1 +	beq		2222f +	.if		\enc == 1 +	mix_columns_2x	\in0, \in1 +	ldr		q13, .LForward_ShiftRows +	.else +	inv_mix_cols_2x	\in0, \in1 +	ldr		q13, .LReverse_ShiftRows +	.endif +	movi		v12.16b, #0x40 +	b		1111b +2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */ +	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */ +	.endm + +	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i +	ld1		{v15.16b}, [\rk] +	add		\rkp, \rk, #16 +	mov		\i, \rounds +1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */ +	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */ +	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */ +	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */ +	sub_bytes_4x	\in0, \in1, \in2, \in3 +	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */ +	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */ +	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */ +	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */ +	ld1		{v15.16b}, [\rkp], #16 +	subs		\i, \i, #1 +	beq		2222f +	.if		\enc == 1 +	mix_columns_2x	\in0, \in1 +	mix_columns_2x	\in2, \in3 +	ldr		q13, .LForward_ShiftRows +	.else +	inv_mix_cols_4x	\in0, \in1, \in2, \in3 +	ldr		q13, .LReverse_ShiftRows +	.endif +	movi		v12.16b, #0x40 +	b		1111b +2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */ +	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */ +	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */ +	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */ +	.endm + +	.macro		encrypt_block2x, in0, in1, rounds, rk, rkp, i +	do_block_2x	1, \in0, \in1, \rounds, \rk, \rkp, \i +	.endm + +	.macro		decrypt_block2x, in0, in1, rounds, rk, rkp, i +	do_block_2x	0, \in0, \in1, \rounds, \rk, \rkp, \i +	.endm + +	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i +	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i +	.endm + +	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i +	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i +	.endm + +#include "aes-modes.S" + +	.text +	.align		4 +.LForward_ShiftRows: +	.byte		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 +	.byte		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb + +.LReverse_ShiftRows: +	.byte		0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb +	.byte		0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 + +.LForward_Sbox: +	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 +	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +	.byte		0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +	.byte		0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +	.byte		0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +	.byte		
0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +	.byte		0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +	.byte		0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +	.byte		0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +	.byte		0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +	.byte		0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +	.byte		0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +	.byte		0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +	.byte		0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +	.byte		0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +	.byte		0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +	.byte		0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +	.byte		0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +	.byte		0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +	.byte		0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +	.byte		0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +	.byte		0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +	.byte		0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +	.byte		0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +	.byte		0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +	.byte		0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +	.byte		0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +	.byte		0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +	.byte		0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +	.byte		0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +	.byte		0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +	.byte		0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 + +.LReverse_Sbox: +	.byte		0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 +	.byte		0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +	.byte		0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +	.byte		0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +	.byte		0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +	.byte		0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +	.byte		0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +	.byte		0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +	.byte		0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +	.byte		0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +	.byte		0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +	.byte		0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +	.byte		0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +	.byte		0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +	.byte		0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +	.byte		0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +	.byte		0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +	.byte		0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +	.byte		0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +	.byte		0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +	.byte		0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +	.byte		0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +	.byte		0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +	.byte		0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +	.byte		0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +	.byte		0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +	.byte		0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +	.byte		0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +	.byte		0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S new file mode 100644 index 00000000000..dc457015884 --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -0,0 +1,79 @@ +/* + * Accelerated GHASH 
implementation with ARMv8 PMULL instructions. + * + * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +	SHASH	.req	v0 +	SHASH2	.req	v1 +	T1	.req	v2 +	T2	.req	v3 +	MASK	.req	v4 +	XL	.req	v5 +	XM	.req	v6 +	XH	.req	v7 +	IN1	.req	v7 + +	.text +	.arch		armv8-a+crypto + +	/* +	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src, +	 *			   struct ghash_key const *k, const char *head) +	 */ +ENTRY(pmull_ghash_update) +	ld1		{SHASH.16b}, [x3] +	ld1		{XL.16b}, [x1] +	movi		MASK.16b, #0xe1 +	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8 +	shl		MASK.2d, MASK.2d, #57 +	eor		SHASH2.16b, SHASH2.16b, SHASH.16b + +	/* do the head block first, if supplied */ +	cbz		x4, 0f +	ld1		{T1.2d}, [x4] +	b		1f + +0:	ld1		{T1.2d}, [x2], #16 +	sub		w0, w0, #1 + +1:	/* multiply XL by SHASH in GF(2^128) */ +CPU_LE(	rev64		T1.16b, T1.16b	) + +	ext		T2.16b, XL.16b, XL.16b, #8 +	ext		IN1.16b, T1.16b, T1.16b, #8 +	eor		T1.16b, T1.16b, T2.16b +	eor		XL.16b, XL.16b, IN1.16b + +	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1 +	eor		T1.16b, T1.16b, XL.16b +	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0 +	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0) + +	ext		T1.16b, XL.16b, XH.16b, #8 +	eor		T2.16b, XL.16b, XH.16b +	eor		XM.16b, XM.16b, T1.16b +	eor		XM.16b, XM.16b, T2.16b +	pmull		T2.1q, XL.1d, MASK.1d + +	mov		XH.d[0], XM.d[1] +	mov		XM.d[1], XL.d[0] + +	eor		XL.16b, XM.16b, T2.16b +	ext		T2.16b, XL.16b, XL.16b, #8 +	pmull		XL.1q, XL.1d, MASK.1d +	eor		T2.16b, T2.16b, XH.16b +	eor		XL.16b, XL.16b, T2.16b + +	cbnz		w0, 0b + +	st1		{XL.16b}, [x1] +	ret +ENDPROC(pmull_ghash_update) diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c new file mode 100644 index 00000000000..833ec1e3f3e --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -0,0 +1,156 @@ +/* + * Accelerated GHASH implementation with ARMv8 PMULL instructions. + * + * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#include <asm/neon.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +#define GHASH_BLOCK_SIZE	16 +#define GHASH_DIGEST_SIZE	16 + +struct ghash_key { +	u64 a; +	u64 b; +}; + +struct ghash_desc_ctx { +	u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)]; +	u8 buf[GHASH_BLOCK_SIZE]; +	u32 count; +}; + +asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, +				   struct ghash_key const *k, const char *head); + +static int ghash_init(struct shash_desc *desc) +{ +	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); + +	*ctx = (struct ghash_desc_ctx){}; +	return 0; +} + +static int ghash_update(struct shash_desc *desc, const u8 *src, +			unsigned int len) +{ +	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); +	unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; + +	ctx->count += len; + +	if ((partial + len) >= GHASH_BLOCK_SIZE) { +		struct ghash_key *key = crypto_shash_ctx(desc->tfm); +		int blocks; + +		if (partial) { +			int p = GHASH_BLOCK_SIZE - partial; + +			memcpy(ctx->buf + partial, src, p); +			src += p; +			len -= p; +		} + +		blocks = len / GHASH_BLOCK_SIZE; +		len %= GHASH_BLOCK_SIZE; + +		kernel_neon_begin_partial(8); +		pmull_ghash_update(blocks, ctx->digest, src, key, +				   partial ? ctx->buf : NULL); +		kernel_neon_end(); +		src += blocks * GHASH_BLOCK_SIZE; +		partial = 0; +	} +	if (len) +		memcpy(ctx->buf + partial, src, len); +	return 0; +} + +static int ghash_final(struct shash_desc *desc, u8 *dst) +{ +	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); +	unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; + +	if (partial) { +		struct ghash_key *key = crypto_shash_ctx(desc->tfm); + +		memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); + +		kernel_neon_begin_partial(8); +		pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL); +		kernel_neon_end(); +	} +	put_unaligned_be64(ctx->digest[1], dst); +	put_unaligned_be64(ctx->digest[0], dst + 8); + +	*ctx = (struct ghash_desc_ctx){}; +	return 0; +} + +static int ghash_setkey(struct crypto_shash *tfm, +			const u8 *inkey, unsigned int keylen) +{ +	struct ghash_key *key = crypto_shash_ctx(tfm); +	u64 a, b; + +	if (keylen != GHASH_BLOCK_SIZE) { +		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); +		return -EINVAL; +	} + +	/* perform multiplication by 'x' in GF(2^128) */ +	b = get_unaligned_be64(inkey); +	a = get_unaligned_be64(inkey + 8); + +	key->a = (a << 1) | (b >> 63); +	key->b = (b << 1) | (a >> 63); + +	if (b >> 63) +		key->b ^= 0xc200000000000000UL; + +	return 0; +} + +static struct shash_alg ghash_alg = { +	.digestsize	= GHASH_DIGEST_SIZE, +	.init		= ghash_init, +	.update		= ghash_update, +	.final		= ghash_final, +	.setkey		= ghash_setkey, +	.descsize	= sizeof(struct ghash_desc_ctx), +	.base		= { +		.cra_name		= "ghash", +		.cra_driver_name	= "ghash-ce", +		.cra_priority		= 200, +		.cra_flags		= CRYPTO_ALG_TYPE_SHASH, +		.cra_blocksize		= GHASH_BLOCK_SIZE, +		.cra_ctxsize		= sizeof(struct ghash_key), +		.cra_module		= THIS_MODULE, +	}, +}; + +static int __init ghash_ce_mod_init(void) +{ +	return crypto_register_shash(&ghash_alg); +} + +static void __exit ghash_ce_mod_exit(void) +{ +	crypto_unregister_shash(&ghash_alg); +} + +module_cpu_feature_match(PMULL, ghash_ce_mod_init); +module_exit(ghash_ce_mod_exit); diff --git 
a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S new file mode 100644 index 00000000000..09d57d98609 --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -0,0 +1,153 @@ +/* + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +	.text +	.arch		armv8-a+crypto + +	k0		.req	v0 +	k1		.req	v1 +	k2		.req	v2 +	k3		.req	v3 + +	t0		.req	v4 +	t1		.req	v5 + +	dga		.req	q6 +	dgav		.req	v6 +	dgb		.req	s7 +	dgbv		.req	v7 + +	dg0q		.req	q12 +	dg0s		.req	s12 +	dg0v		.req	v12 +	dg1s		.req	s13 +	dg1v		.req	v13 +	dg2s		.req	s14 + +	.macro		add_only, op, ev, rc, s0, dg1 +	.ifc		\ev, ev +	add		t1.4s, v\s0\().4s, \rc\().4s +	sha1h		dg2s, dg0s +	.ifnb		\dg1 +	sha1\op		dg0q, \dg1, t0.4s +	.else +	sha1\op		dg0q, dg1s, t0.4s +	.endif +	.else +	.ifnb		\s0 +	add		t0.4s, v\s0\().4s, \rc\().4s +	.endif +	sha1h		dg1s, dg0s +	sha1\op		dg0q, dg2s, t1.4s +	.endif +	.endm + +	.macro		add_update, op, ev, rc, s0, s1, s2, s3, dg1 +	sha1su0		v\s0\().4s, v\s1\().4s, v\s2\().4s +	add_only	\op, \ev, \rc, \s1, \dg1 +	sha1su1		v\s0\().4s, v\s3\().4s +	.endm + +	/* +	 * The SHA1 round constants +	 */ +	.align		4 +.Lsha1_rcon: +	.word		0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 + +	/* +	 * void sha1_ce_transform(int blocks, u8 const *src, u32 *state, +	 * 			  u8 *head, long bytes) +	 */ +ENTRY(sha1_ce_transform) +	/* load round constants */ +	adr		x6, .Lsha1_rcon +	ld1r		{k0.4s}, [x6], #4 +	ld1r		{k1.4s}, [x6], #4 +	ld1r		{k2.4s}, [x6], #4 +	ld1r		{k3.4s}, [x6] + +	/* load state */ +	ldr		dga, [x2] +	ldr		dgb, [x2, #16] + +	/* load partial state (if supplied) */ +	cbz		x3, 0f +	ld1		{v8.4s-v11.4s}, [x3] +	b		1f + +	/* load input */ +0:	ld1		{v8.4s-v11.4s}, [x1], #64 +	sub		w0, w0, #1 + +1: +CPU_LE(	rev32		v8.16b, v8.16b		) +CPU_LE(	rev32		v9.16b, v9.16b		) +CPU_LE(	rev32		v10.16b, v10.16b	) +CPU_LE(	rev32		v11.16b, v11.16b	) + +2:	add		t0.4s, v8.4s, k0.4s +	mov		dg0v.16b, dgav.16b + +	add_update	c, ev, k0,  8,  9, 10, 11, dgb +	add_update	c, od, k0,  9, 10, 11,  8 +	add_update	c, ev, k0, 10, 11,  8,  9 +	add_update	c, od, k0, 11,  8,  9, 10 +	add_update	c, ev, k1,  8,  9, 10, 11 + +	add_update	p, od, k1,  9, 10, 11,  8 +	add_update	p, ev, k1, 10, 11,  8,  9 +	add_update	p, od, k1, 11,  8,  9, 10 +	add_update	p, ev, k1,  8,  9, 10, 11 +	add_update	p, od, k2,  9, 10, 11,  8 + +	add_update	m, ev, k2, 10, 11,  8,  9 +	add_update	m, od, k2, 11,  8,  9, 10 +	add_update	m, ev, k2,  8,  9, 10, 11 +	add_update	m, od, k2,  9, 10, 11,  8 +	add_update	m, ev, k3, 10, 11,  8,  9 + +	add_update	p, od, k3, 11,  8,  9, 10 +	add_only	p, ev, k3,  9 +	add_only	p, od, k3, 10 +	add_only	p, ev, k3, 11 +	add_only	p, od + +	/* update state */ +	add		dgbv.2s, dgbv.2s, dg1v.2s +	add		dgav.4s, dgav.4s, dg0v.4s + +	cbnz		w0, 0b + +	/* +	 * Final block: add padding and total bit count. +	 * Skip if we have no total byte count in x4. In that case, the input +	 * size was not a round multiple of the block size, and the padding is +	 * handled by the C code. 
+	 */ +	cbz		x4, 3f +	movi		v9.2d, #0 +	mov		x8, #0x80000000 +	movi		v10.2d, #0 +	ror		x7, x4, #29		// ror(lsl(x4, 3), 32) +	fmov		d8, x8 +	mov		x4, #0 +	mov		v11.d[0], xzr +	mov		v11.d[1], x7 +	b		2b + +	/* store new state */ +3:	str		dga, [x2] +	str		dgb, [x2, #16] +	ret +ENDPROC(sha1_ce_transform) diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c new file mode 100644 index 00000000000..6fe83f37a75 --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -0,0 +1,174 @@ +/* + * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/neon.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <crypto/sha.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state, +				  u8 *head, long bytes); + +static int sha1_init(struct shash_desc *desc) +{ +	struct sha1_state *sctx = shash_desc_ctx(desc); + +	*sctx = (struct sha1_state){ +		.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, +	}; +	return 0; +} + +static int sha1_update(struct shash_desc *desc, const u8 *data, +		       unsigned int len) +{ +	struct sha1_state *sctx = shash_desc_ctx(desc); +	unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; + +	sctx->count += len; + +	if ((partial + len) >= SHA1_BLOCK_SIZE) { +		int blocks; + +		if (partial) { +			int p = SHA1_BLOCK_SIZE - partial; + +			memcpy(sctx->buffer + partial, data, p); +			data += p; +			len -= p; +		} + +		blocks = len / SHA1_BLOCK_SIZE; +		len %= SHA1_BLOCK_SIZE; + +		kernel_neon_begin_partial(16); +		sha1_ce_transform(blocks, data, sctx->state, +				  partial ? sctx->buffer : NULL, 0); +		kernel_neon_end(); + +		data += blocks * SHA1_BLOCK_SIZE; +		partial = 0; +	} +	if (len) +		memcpy(sctx->buffer + partial, data, len); +	return 0; +} + +static int sha1_final(struct shash_desc *desc, u8 *out) +{ +	static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; + +	struct sha1_state *sctx = shash_desc_ctx(desc); +	__be64 bits = cpu_to_be64(sctx->count << 3); +	__be32 *dst = (__be32 *)out; +	int i; + +	u32 padlen = SHA1_BLOCK_SIZE +		     - ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE); + +	sha1_update(desc, padding, padlen); +	sha1_update(desc, (const u8 *)&bits, sizeof(bits)); + +	for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) +		put_unaligned_be32(sctx->state[i], dst++); + +	*sctx = (struct sha1_state){}; +	return 0; +} + +static int sha1_finup(struct shash_desc *desc, const u8 *data, +		      unsigned int len, u8 *out) +{ +	struct sha1_state *sctx = shash_desc_ctx(desc); +	__be32 *dst = (__be32 *)out; +	int blocks; +	int i; + +	if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) { +		sha1_update(desc, data, len); +		return sha1_final(desc, out); +	} + +	/* +	 * Use a fast path if the input is a multiple of 64 bytes. 
In +	 * this case, there is no need to copy data around, and we can +	 * perform the entire digest calculation in a single invocation +	 * of sha1_ce_transform() +	 */ +	blocks = len / SHA1_BLOCK_SIZE; + +	kernel_neon_begin_partial(16); +	sha1_ce_transform(blocks, data, sctx->state, NULL, len); +	kernel_neon_end(); + +	for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) +		put_unaligned_be32(sctx->state[i], dst++); + +	*sctx = (struct sha1_state){}; +	return 0; +} + +static int sha1_export(struct shash_desc *desc, void *out) +{ +	struct sha1_state *sctx = shash_desc_ctx(desc); +	struct sha1_state *dst = out; + +	*dst = *sctx; +	return 0; +} + +static int sha1_import(struct shash_desc *desc, const void *in) +{ +	struct sha1_state *sctx = shash_desc_ctx(desc); +	struct sha1_state const *src = in; + +	*sctx = *src; +	return 0; +} + +static struct shash_alg alg = { +	.init			= sha1_init, +	.update			= sha1_update, +	.final			= sha1_final, +	.finup			= sha1_finup, +	.export			= sha1_export, +	.import			= sha1_import, +	.descsize		= sizeof(struct sha1_state), +	.digestsize		= SHA1_DIGEST_SIZE, +	.statesize		= sizeof(struct sha1_state), +	.base			= { +		.cra_name		= "sha1", +		.cra_driver_name	= "sha1-ce", +		.cra_priority		= 200, +		.cra_flags		= CRYPTO_ALG_TYPE_SHASH, +		.cra_blocksize		= SHA1_BLOCK_SIZE, +		.cra_module		= THIS_MODULE, +	} +}; + +static int __init sha1_ce_mod_init(void) +{ +	return crypto_register_shash(&alg); +} + +static void __exit sha1_ce_mod_fini(void) +{ +	crypto_unregister_shash(&alg); +} + +module_cpu_feature_match(SHA1, sha1_ce_mod_init); +module_exit(sha1_ce_mod_fini); diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S new file mode 100644 index 00000000000..7f29fc031ea --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-core.S @@ -0,0 +1,156 @@ +/* + * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +	.text +	.arch		armv8-a+crypto + +	dga		.req	q20 +	dgav		.req	v20 +	dgb		.req	q21 +	dgbv		.req	v21 + +	t0		.req	v22 +	t1		.req	v23 + +	dg0q		.req	q24 +	dg0v		.req	v24 +	dg1q		.req	q25 +	dg1v		.req	v25 +	dg2q		.req	q26 +	dg2v		.req	v26 + +	.macro		add_only, ev, rc, s0 +	mov		dg2v.16b, dg0v.16b +	.ifeq		\ev +	add		t1.4s, v\s0\().4s, \rc\().4s +	sha256h		dg0q, dg1q, t0.4s +	sha256h2	dg1q, dg2q, t0.4s +	.else +	.ifnb		\s0 +	add		t0.4s, v\s0\().4s, \rc\().4s +	.endif +	sha256h		dg0q, dg1q, t1.4s +	sha256h2	dg1q, dg2q, t1.4s +	.endif +	.endm + +	.macro		add_update, ev, rc, s0, s1, s2, s3 +	sha256su0	v\s0\().4s, v\s1\().4s +	add_only	\ev, \rc, \s1 +	sha256su1	v\s0\().4s, v\s2\().4s, v\s3\().4s +	.endm + +	/* +	 * The SHA-256 round constants +	 */ +	.align		4 +.Lsha2_rcon: +	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 +	.word		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 +	.word		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 +	.word		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 +	.word		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc +	.word		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da +	.word		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 +	.word		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 +	.word		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 +	.word		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 +	.word		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 +	.word		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 +	.word		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 +	.word		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 +	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 +	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + +	/* +	 * void sha2_ce_transform(int blocks, u8 const *src, u32 *state, +	 *                        u8 *head, long bytes) +	 */ +ENTRY(sha2_ce_transform) +	/* load round constants */ +	adr		x8, .Lsha2_rcon +	ld1		{ v0.4s- v3.4s}, [x8], #64 +	ld1		{ v4.4s- v7.4s}, [x8], #64 +	ld1		{ v8.4s-v11.4s}, [x8], #64 +	ld1		{v12.4s-v15.4s}, [x8] + +	/* load state */ +	ldp		dga, dgb, [x2] + +	/* load partial input (if supplied) */ +	cbz		x3, 0f +	ld1		{v16.4s-v19.4s}, [x3] +	b		1f + +	/* load input */ +0:	ld1		{v16.4s-v19.4s}, [x1], #64 +	sub		w0, w0, #1 + +1: +CPU_LE(	rev32		v16.16b, v16.16b	) +CPU_LE(	rev32		v17.16b, v17.16b	) +CPU_LE(	rev32		v18.16b, v18.16b	) +CPU_LE(	rev32		v19.16b, v19.16b	) + +2:	add		t0.4s, v16.4s, v0.4s +	mov		dg0v.16b, dgav.16b +	mov		dg1v.16b, dgbv.16b + +	add_update	0,  v1, 16, 17, 18, 19 +	add_update	1,  v2, 17, 18, 19, 16 +	add_update	0,  v3, 18, 19, 16, 17 +	add_update	1,  v4, 19, 16, 17, 18 + +	add_update	0,  v5, 16, 17, 18, 19 +	add_update	1,  v6, 17, 18, 19, 16 +	add_update	0,  v7, 18, 19, 16, 17 +	add_update	1,  v8, 19, 16, 17, 18 + +	add_update	0,  v9, 16, 17, 18, 19 +	add_update	1, v10, 17, 18, 19, 16 +	add_update	0, v11, 18, 19, 16, 17 +	add_update	1, v12, 19, 16, 17, 18 + +	add_only	0, v13, 17 +	add_only	1, v14, 18 +	add_only	0, v15, 19 +	add_only	1 + +	/* update state */ +	add		dgav.4s, dgav.4s, dg0v.4s +	add		dgbv.4s, dgbv.4s, dg1v.4s + +	/* handled all input blocks? */ +	cbnz		w0, 0b + +	/* +	 * Final block: add padding and total bit count. +	 * Skip if we have no total byte count in x4. In that case, the input +	 * size was not a round multiple of the block size, and the padding is +	 * handled by the C code. 
+	 */ +	cbz		x4, 3f +	movi		v17.2d, #0 +	mov		x8, #0x80000000 +	movi		v18.2d, #0 +	ror		x7, x4, #29		// ror(lsl(x4, 3), 32) +	fmov		d16, x8 +	mov		x4, #0 +	mov		v19.d[0], xzr +	mov		v19.d[1], x7 +	b		2b + +	/* store new state */ +3:	stp		dga, dgb, [x2] +	ret +ENDPROC(sha2_ce_transform) diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c new file mode 100644 index 00000000000..c294e67d392 --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -0,0 +1,255 @@ +/* + * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/neon.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <crypto/sha.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state, +				 u8 *head, long bytes); + +static int sha224_init(struct shash_desc *desc) +{ +	struct sha256_state *sctx = shash_desc_ctx(desc); + +	*sctx = (struct sha256_state){ +		.state = { +			SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, +			SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, +		} +	}; +	return 0; +} + +static int sha256_init(struct shash_desc *desc) +{ +	struct sha256_state *sctx = shash_desc_ctx(desc); + +	*sctx = (struct sha256_state){ +		.state = { +			SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, +			SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, +		} +	}; +	return 0; +} + +static int sha2_update(struct shash_desc *desc, const u8 *data, +		       unsigned int len) +{ +	struct sha256_state *sctx = shash_desc_ctx(desc); +	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; + +	sctx->count += len; + +	if ((partial + len) >= SHA256_BLOCK_SIZE) { +		int blocks; + +		if (partial) { +			int p = SHA256_BLOCK_SIZE - partial; + +			memcpy(sctx->buf + partial, data, p); +			data += p; +			len -= p; +		} + +		blocks = len / SHA256_BLOCK_SIZE; +		len %= SHA256_BLOCK_SIZE; + +		kernel_neon_begin_partial(28); +		sha2_ce_transform(blocks, data, sctx->state, +				  partial ? 
sctx->buf : NULL, 0); +		kernel_neon_end(); + +		data += blocks * SHA256_BLOCK_SIZE; +		partial = 0; +	} +	if (len) +		memcpy(sctx->buf + partial, data, len); +	return 0; +} + +static void sha2_final(struct shash_desc *desc) +{ +	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; + +	struct sha256_state *sctx = shash_desc_ctx(desc); +	__be64 bits = cpu_to_be64(sctx->count << 3); +	u32 padlen = SHA256_BLOCK_SIZE +		     - ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE); + +	sha2_update(desc, padding, padlen); +	sha2_update(desc, (const u8 *)&bits, sizeof(bits)); +} + +static int sha224_final(struct shash_desc *desc, u8 *out) +{ +	struct sha256_state *sctx = shash_desc_ctx(desc); +	__be32 *dst = (__be32 *)out; +	int i; + +	sha2_final(desc); + +	for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) +		put_unaligned_be32(sctx->state[i], dst++); + +	*sctx = (struct sha256_state){}; +	return 0; +} + +static int sha256_final(struct shash_desc *desc, u8 *out) +{ +	struct sha256_state *sctx = shash_desc_ctx(desc); +	__be32 *dst = (__be32 *)out; +	int i; + +	sha2_final(desc); + +	for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) +		put_unaligned_be32(sctx->state[i], dst++); + +	*sctx = (struct sha256_state){}; +	return 0; +} + +static void sha2_finup(struct shash_desc *desc, const u8 *data, +		       unsigned int len) +{ +	struct sha256_state *sctx = shash_desc_ctx(desc); +	int blocks; + +	if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) { +		sha2_update(desc, data, len); +		sha2_final(desc); +		return; +	} + +	/* +	 * Use a fast path if the input is a multiple of 64 bytes. In +	 * this case, there is no need to copy data around, and we can +	 * perform the entire digest calculation in a single invocation +	 * of sha2_ce_transform() +	 */ +	blocks = len / SHA256_BLOCK_SIZE; + +	kernel_neon_begin_partial(28); +	sha2_ce_transform(blocks, data, sctx->state, NULL, len); +	kernel_neon_end(); +	data += blocks * SHA256_BLOCK_SIZE; +} + +static int sha224_finup(struct shash_desc *desc, const u8 *data, +			unsigned int len, u8 *out) +{ +	struct sha256_state *sctx = shash_desc_ctx(desc); +	__be32 *dst = (__be32 *)out; +	int i; + +	sha2_finup(desc, data, len); + +	for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) +		put_unaligned_be32(sctx->state[i], dst++); + +	*sctx = (struct sha256_state){}; +	return 0; +} + +static int sha256_finup(struct shash_desc *desc, const u8 *data, +			unsigned int len, u8 *out) +{ +	struct sha256_state *sctx = shash_desc_ctx(desc); +	__be32 *dst = (__be32 *)out; +	int i; + +	sha2_finup(desc, data, len); + +	for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) +		put_unaligned_be32(sctx->state[i], dst++); + +	*sctx = (struct sha256_state){}; +	return 0; +} + +static int sha2_export(struct shash_desc *desc, void *out) +{ +	struct sha256_state *sctx = shash_desc_ctx(desc); +	struct sha256_state *dst = out; + +	*dst = *sctx; +	return 0; +} + +static int sha2_import(struct shash_desc *desc, const void *in) +{ +	struct sha256_state *sctx = shash_desc_ctx(desc); +	struct sha256_state const *src = in; + +	*sctx = *src; +	return 0; +} + +static struct shash_alg algs[] = { { +	.init			= sha224_init, +	.update			= sha2_update, +	.final			= sha224_final, +	.finup			= sha224_finup, +	.export			= sha2_export, +	.import			= sha2_import, +	.descsize		= sizeof(struct sha256_state), +	.digestsize		= SHA224_DIGEST_SIZE, +	.statesize		= sizeof(struct sha256_state), +	.base			= { +		.cra_name		= "sha224", +		.cra_driver_name	= "sha224-ce", +		.cra_priority		= 200, 
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH, +		.cra_blocksize		= SHA256_BLOCK_SIZE, +		.cra_module		= THIS_MODULE, +	} +}, { +	.init			= sha256_init, +	.update			= sha2_update, +	.final			= sha256_final, +	.finup			= sha256_finup, +	.export			= sha2_export, +	.import			= sha2_import, +	.descsize		= sizeof(struct sha256_state), +	.digestsize		= SHA256_DIGEST_SIZE, +	.statesize		= sizeof(struct sha256_state), +	.base			= { +		.cra_name		= "sha256", +		.cra_driver_name	= "sha256-ce", +		.cra_priority		= 200, +		.cra_flags		= CRYPTO_ALG_TYPE_SHASH, +		.cra_blocksize		= SHA256_BLOCK_SIZE, +		.cra_module		= THIS_MODULE, +	} +} }; + +static int __init sha2_ce_mod_init(void) +{ +	return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} + +static void __exit sha2_ce_mod_fini(void) +{ +	crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_cpu_feature_match(SHA2, sha2_ce_mod_init); +module_exit(sha2_ce_mod_fini);  | 
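
The XTS handling at the top of this hunk chains per-block tweaks with the next_tweak macro and the .Lxts_mul_x constant, both of which are defined earlier in aes-modes.S and are not visible in this hunk. What they implement is the standard XTS tweak schedule: multiply the 128-bit tweak by x in GF(2^128) with the x^128 + x^7 + x^2 + x + 1 reduction, i.e. a one-bit left shift with an 0x87 feedback into the low byte. A byte-wise C sketch of that schedule, for reference only (xts_next_tweak is an illustrative name, not a kernel symbol):

    #include <stdint.h>

    /*
     * One step of the XTS tweak schedule: double the 128-bit tweak,
     * held in little-endian byte order, in GF(2^128) and fold the
     * carry out of the top bit back in with the 0x87 constant.
     */
    static void xts_next_tweak(uint8_t t[16])
    {
            int carry = 0;
            int i;

            for (i = 0; i < 16; i++) {
                    int next = t[i] >> 7;

                    t[i] = (uint8_t)((t[i] << 1) | carry);
                    carry = next;
            }
            if (carry)
                    t[0] ^= 0x87;
    }

In the interleaved paths above, the same schedule is simply applied several times in a row (v4 -> v5 -> v6 -> v7) so that two or four blocks can be processed per loop iteration.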
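
aes-neon.S builds MixColumns and its inverse on top of the mul_by_x macro, described in its comment as multiplication by the polynomial 'x' in GF(2^8): sshr #7 produces a 0x00/0xff carry mask per byte, the add doubles each byte, and the mask selects the 0x1b reduction constant (preloaded into v14) to XOR back in. A scalar C equivalent of that single-byte operation (the function name is illustrative):

    #include <stdint.h>

    /* Multiply one GF(2^8) element by x, reducing modulo the AES
     * polynomial x^8 + x^4 + x^3 + x + 1 (0x11b). */
    static uint8_t gf256_mul_by_x(uint8_t b)
    {
            return (uint8_t)((b << 1) ^ ((b & 0x80) ? 0x1b : 0x00));
    }

Applying it twice gives multiplication by x^2, which is what inv_mix_columns uses (together with a halfword rotate and a couple of XORs) to reduce InvMixColumns to a pre-multiplication followed by the forward mix_columns.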
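
ghash_setkey() in ghash-ce-glue.c stores the hash key pre-multiplied by x in GF(2^128): the two big-endian quadwords are shifted left by one bit, the cross carries are propagated, and a carry out of the top bit is folded back in with the 0xc2 << 56 reduction constant. The same derivation, restated as a standalone C sketch (get_be64() stands in for the kernel's get_unaligned_be64(), and ghash_derive_key is an illustrative name):

    #include <stdint.h>

    struct ghash_key {
            uint64_t a;
            uint64_t b;
    };

    static uint64_t get_be64(const uint8_t *p)
    {
            uint64_t v = 0;
            int i;

            for (i = 0; i < 8; i++)
                    v = (v << 8) | p[i];
            return v;
    }

    /* Shift the 128-bit key H left by one bit and reduce, exactly as
     * ghash_setkey() does before handing the result to the PMULL code. */
    static void ghash_derive_key(struct ghash_key *key, const uint8_t inkey[16])
    {
            uint64_t b = get_be64(inkey);       /* first quadword of H */
            uint64_t a = get_be64(inkey + 8);   /* second quadword of H */

            key->a = (a << 1) | (b >> 63);
            key->b = (b << 1) | (a >> 63);

            if (b >> 63)
                    key->b ^= 0xc200000000000000ULL;
    }

In pmull_ghash_update() the remaining per-block constants are likewise set up once on entry: SHASH2 is the XOR of the two key halves, used for the Karatsuba-style (a1 + a0)(b1 + b0) middle product.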
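
Both sha1-ce-core.S and sha2-ce-core.S build the length field of the final padding block with a single "ror x7, x4, #29", annotated "ror(lsl(x4, 3), 32)": the byte count in x4 is converted to a bit count and its two 32-bit halves are swapped in one instruction, matching the layout the rounds expect for this constructed block, which bypasses the CPU_LE rev32 applied to data loaded from memory. A small C check of that identity (illustrative names, not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    /* 64-bit rotate right, as the AArch64 ror instruction computes it */
    static uint64_t ror64(uint64_t x, unsigned int n)
    {
            return (x >> n) | (x << (64 - n));
    }

    int main(void)
    {
            uint64_t bytes = 3 * 64;    /* arbitrary example byte count */

            printf("%d\n", ror64(bytes, 29) == ror64(bytes << 3, 32));
            return 0;
    }

The shift by three turns bytes into bits and the rotate by 32 swaps the halves; folding both into a single ror #29 only works while the byte count stays below 2^61, so that the shift drops no significant bits.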
