Diffstat (limited to 'arch/arm64/lib')
-rw-r--r--  arch/arm64/lib/Makefile          |   5
-rw-r--r--  arch/arm64/lib/bitops.S          |  69
-rw-r--r--  arch/arm64/lib/clear_page.S      |  39
-rw-r--r--  arch/arm64/lib/clear_user.S      |  58
-rw-r--r--  arch/arm64/lib/copy_from_user.S  |  66
-rw-r--r--  arch/arm64/lib/copy_in_user.S    |  63
-rw-r--r--  arch/arm64/lib/copy_page.S       |  46
-rw-r--r--  arch/arm64/lib/copy_to_user.S    |  61
-rw-r--r--  arch/arm64/lib/delay.c           |  55
-rw-r--r--  arch/arm64/lib/memchr.S          |  44
-rw-r--r--  arch/arm64/lib/memcmp.S          | 258
-rw-r--r--  arch/arm64/lib/memcpy.S          | 201
-rw-r--r--  arch/arm64/lib/memmove.S         | 197
-rw-r--r--  arch/arm64/lib/memset.S          | 216
-rw-r--r--  arch/arm64/lib/strchr.S          |  42
-rw-r--r--  arch/arm64/lib/strcmp.S          | 234
-rw-r--r--  arch/arm64/lib/strlen.S          | 126
-rw-r--r--  arch/arm64/lib/strncmp.S         | 310
-rw-r--r--  arch/arm64/lib/strnlen.S         | 171
-rw-r--r--  arch/arm64/lib/strrchr.S         |  43
20 files changed, 2304 insertions(+), 0 deletions(-)
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
new file mode 100644
index 00000000000..d98d3e39879
--- /dev/null
+++ b/arch/arm64/lib/Makefile
@@ -0,0 +1,5 @@
+lib-y		:= bitops.o clear_user.o delay.o copy_from_user.o	\
+		   copy_to_user.o copy_in_user.o copy_page.o		\
+		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\
+		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
+		   strchr.o strrchr.o
diff --git a/arch/arm64/lib/bitops.S b/arch/arm64/lib/bitops.S
new file mode 100644
index 00000000000..7dac371cc9a
--- /dev/null
+++ b/arch/arm64/lib/bitops.S
@@ -0,0 +1,69 @@
+/*
+ * Based on arch/arm/lib/bitops.h
+ *
+ * Copyright (C) 2013 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * x0: bits 5:0  bit offset
+ *     bits 31:6 word offset
+ * x1: address
+ */
+	.macro	bitop, name, instr
+ENTRY(	\name	)
+	and	w3, w0, #63		// Get bit offset
+	eor	w0, w0, w3		// Clear low bits
+	mov	x2, #1
+	add	x1, x1, x0, lsr #3	// Get word offset
+	lsl	x3, x2, x3		// Create mask
+1:	ldxr	x2, [x1]
+	\instr	x2, x2, x3
+	stxr	w0, x2, [x1]
+	cbnz	w0, 1b
+	ret
+ENDPROC(\name	)
+	.endm
+
+	.macro	testop, name, instr
+ENTRY(	\name	)
+	and	w3, w0, #63		// Get bit offset
+	eor	w0, w0, w3		// Clear low bits
+	mov	x2, #1
+	add	x1, x1, x0, lsr #3	// Get word offset
+	lsl	x4, x2, x3		// Create mask
+1:	ldxr	x2, [x1]
+	lsr	x0, x2, x3		// Save old value of bit
+	\instr	x2, x2, x4		// toggle bit
+	stlxr	w5, x2, [x1]
+	cbnz	w5, 1b
+	dmb	ish
+	and	x0, x0, #1
+3:	ret
+ENDPROC(\name	)
+	.endm
+
+/*
+ * Atomic bit operations.
+ */
+	bitop	change_bit, eor
+	bitop	clear_bit, bic
+	bitop	set_bit, orr
+
+	testop	test_and_change_bit, eor
+	testop	test_and_clear_bit, bic
+	testop	test_and_set_bit, orr
diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S
new file mode 100644
index 00000000000..ef08e905e35
--- /dev/null
+++ b/arch/arm64/lib/clear_page.S
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */ + +#include <linux/linkage.h> +#include <linux/const.h> +#include <asm/assembler.h> +#include <asm/page.h> + +/* + * Clear page @dest + * + * Parameters: + *	x0 - dest + */ +ENTRY(clear_page) +	mrs	x1, dczid_el0 +	and	w1, w1, #0xf +	mov	x2, #4 +	lsl	x1, x2, x1 + +1:	dc	zva, x0 +	add	x0, x0, x1 +	tst	x0, #(PAGE_SIZE - 1) +	b.ne	1b +	ret +ENDPROC(clear_page) diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S new file mode 100644 index 00000000000..6e0ed93d51f --- /dev/null +++ b/arch/arm64/lib/clear_user.S @@ -0,0 +1,58 @@ +/* + * Based on arch/arm/lib/clear_user.S + * + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ +#include <linux/linkage.h> +#include <asm/assembler.h> + +	.text + +/* Prototype: int __clear_user(void *addr, size_t sz) + * Purpose  : clear some user memory + * Params   : addr - user memory address to clear + *          : sz   - number of bytes to clear + * Returns  : number of bytes NOT cleared + * + * Alignment fixed up by hardware. + */ +ENTRY(__clear_user) +	mov	x2, x1			// save the size for fixup return +	subs	x1, x1, #8 +	b.mi	2f +1: +USER(9f, str	xzr, [x0], #8	) +	subs	x1, x1, #8 +	b.pl	1b +2:	adds	x1, x1, #4 +	b.mi	3f +USER(9f, str	wzr, [x0], #4	) +	sub	x1, x1, #4 +3:	adds	x1, x1, #2 +	b.mi	4f +USER(9f, strh	wzr, [x0], #2	) +	sub	x1, x1, #2 +4:	adds	x1, x1, #1 +	b.mi	5f +	strb	wzr, [x0] +5:	mov	x0, #0 +	ret +ENDPROC(__clear_user) + +	.section .fixup,"ax" +	.align	2 +9:	mov	x0, x2			// return the original size +	ret +	.previous diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S new file mode 100644 index 00000000000..5e27add9d36 --- /dev/null +++ b/arch/arm64/lib/copy_from_user.S @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Copy from user space to a kernel buffer (alignment handled by the hardware) + * + * Parameters: + *	x0 - to + *	x1 - from + *	x2 - n + * Returns: + *	x0 - bytes not copied + */ +ENTRY(__copy_from_user) +	add	x4, x1, x2			// upper user buffer boundary +	subs	x2, x2, #8 +	b.mi	2f +1: +USER(9f, ldr	x3, [x1], #8	) +	subs	x2, x2, #8 +	str	x3, [x0], #8 +	b.pl	1b +2:	adds	x2, x2, #4 +	b.mi	3f +USER(9f, ldr	w3, [x1], #4	) +	sub	x2, x2, #4 +	str	w3, [x0], #4 +3:	adds	x2, x2, #2 +	b.mi	4f +USER(9f, ldrh	w3, [x1], #2	) +	sub	x2, x2, #2 +	strh	w3, [x0], #2 +4:	adds	x2, x2, #1 +	b.mi	5f +USER(9f, ldrb	w3, [x1]	) +	strb	w3, [x0] +5:	mov	x0, #0 +	ret +ENDPROC(__copy_from_user) + +	.section .fixup,"ax" +	.align	2 +9:	sub	x2, x4, x1 +	mov	x3, x2 +10:	strb	wzr, [x0], #1			// zero remaining buffer space +	subs	x3, x3, #1 +	b.ne	10b +	mov	x0, x2				// bytes not copied +	ret +	.previous diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S new file mode 100644 index 00000000000..84b6c9bb9b9 --- /dev/null +++ b/arch/arm64/lib/copy_in_user.S @@ -0,0 +1,63 @@ +/* + * Copy from user space to user space + * + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Copy from user space to user space (alignment handled by the hardware) + * + * Parameters: + *	x0 - to + *	x1 - from + *	x2 - n + * Returns: + *	x0 - bytes not copied + */ +ENTRY(__copy_in_user) +	add	x4, x0, x2			// upper user buffer boundary +	subs	x2, x2, #8 +	b.mi	2f +1: +USER(9f, ldr	x3, [x1], #8	) +	subs	x2, x2, #8 +USER(9f, str	x3, [x0], #8	) +	b.pl	1b +2:	adds	x2, x2, #4 +	b.mi	3f +USER(9f, ldr	w3, [x1], #4	) +	sub	x2, x2, #4 +USER(9f, str	w3, [x0], #4	) +3:	adds	x2, x2, #2 +	b.mi	4f +USER(9f, ldrh	w3, [x1], #2	) +	sub	x2, x2, #2 +USER(9f, strh	w3, [x0], #2	) +4:	adds	x2, x2, #1 +	b.mi	5f +USER(9f, ldrb	w3, [x1]	) +USER(9f, strb	w3, [x0]	) +5:	mov	x0, #0 +	ret +ENDPROC(__copy_in_user) + +	.section .fixup,"ax" +	.align	2 +9:	sub	x0, x4, x0			// bytes not copied +	ret +	.previous diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S new file mode 100644 index 00000000000..512b9a7b980 --- /dev/null +++ b/arch/arm64/lib/copy_page.S @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <linux/linkage.h> +#include <linux/const.h> +#include <asm/assembler.h> +#include <asm/page.h> + +/* + * Copy a page from src to dest (both are page aligned) + * + * Parameters: + *	x0 - dest + *	x1 - src + */ +ENTRY(copy_page) +	/* Assume cache line size is 64 bytes. */ +	prfm	pldl1strm, [x1, #64] +1:	ldp	x2, x3, [x1] +	ldp	x4, x5, [x1, #16] +	ldp	x6, x7, [x1, #32] +	ldp	x8, x9, [x1, #48] +	add	x1, x1, #64 +	prfm	pldl1strm, [x1, #64] +	stnp	x2, x3, [x0] +	stnp	x4, x5, [x0, #16] +	stnp	x6, x7, [x0, #32] +	stnp	x8, x9, [x0, #48] +	add	x0, x0, #64 +	tst	x1, #(PAGE_SIZE - 1) +	b.ne	1b +	ret +ENDPROC(copy_page) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S new file mode 100644 index 00000000000..a0aeeb9b7a2 --- /dev/null +++ b/arch/arm64/lib/copy_to_user.S @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Copy to user space from a kernel buffer (alignment handled by the hardware) + * + * Parameters: + *	x0 - to + *	x1 - from + *	x2 - n + * Returns: + *	x0 - bytes not copied + */ +ENTRY(__copy_to_user) +	add	x4, x0, x2			// upper user buffer boundary +	subs	x2, x2, #8 +	b.mi	2f +1: +	ldr	x3, [x1], #8 +	subs	x2, x2, #8 +USER(9f, str	x3, [x0], #8	) +	b.pl	1b +2:	adds	x2, x2, #4 +	b.mi	3f +	ldr	w3, [x1], #4 +	sub	x2, x2, #4 +USER(9f, str	w3, [x0], #4	) +3:	adds	x2, x2, #2 +	b.mi	4f +	ldrh	w3, [x1], #2 +	sub	x2, x2, #2 +USER(9f, strh	w3, [x0], #2	) +4:	adds	x2, x2, #1 +	b.mi	5f +	ldrb	w3, [x1] +USER(9f, strb	w3, [x0]	) +5:	mov	x0, #0 +	ret +ENDPROC(__copy_to_user) + +	.section .fixup,"ax" +	.align	2 +9:	sub	x0, x4, x0			// bytes not copied +	ret +	.previous diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c new file mode 100644 index 00000000000..dad4ec9bbfd --- /dev/null +++ b/arch/arm64/lib/delay.c @@ -0,0 +1,55 @@ +/* + * Delay loops based on the OpenRISC implementation. + * + * Copyright (C) 2012 ARM Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+ * + * Author: Will Deacon <will.deacon@arm.com> + */ + +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/timex.h> + +void __delay(unsigned long cycles) +{ +	cycles_t start = get_cycles(); + +	while ((get_cycles() - start) < cycles) +		cpu_relax(); +} +EXPORT_SYMBOL(__delay); + +inline void __const_udelay(unsigned long xloops) +{ +	unsigned long loops; + +	loops = xloops * loops_per_jiffy * HZ; +	__delay(loops >> 32); +} +EXPORT_SYMBOL(__const_udelay); + +void __udelay(unsigned long usecs) +{ +	__const_udelay(usecs * 0x10C7UL); /* 2**32 / 1000000 (rounded up) */ +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long nsecs) +{ +	__const_udelay(nsecs * 0x5UL); /* 2**32 / 1000000000 (rounded up) */ +} +EXPORT_SYMBOL(__ndelay); diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S new file mode 100644 index 00000000000..8636b754916 --- /dev/null +++ b/arch/arm64/lib/memchr.S @@ -0,0 +1,44 @@ +/* + * Based on arch/arm/lib/memchr.S + * + * Copyright (C) 1995-2000 Russell King + * Copyright (C) 2013 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Find a character in an area of memory. + * + * Parameters: + *	x0 - buf + *	x1 - c + *	x2 - n + * Returns: + *	x0 - address of first occurrence of 'c' or 0 + */ +ENTRY(memchr) +	and	w1, w1, #0xff +1:	subs	x2, x2, #1 +	b.mi	2f +	ldrb	w3, [x0], #1 +	cmp	w3, w1 +	b.ne	1b +	sub	x0, x0, #1 +	ret +2:	mov	x0, #0 +	ret +ENDPROC(memchr) diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S new file mode 100644 index 00000000000..6ea0776ba6d --- /dev/null +++ b/arch/arm64/lib/memcmp.S @@ -0,0 +1,258 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* +* compare memory areas(when two memory areas' offset are different, +* alignment handled by the hardware) +* +* Parameters: +*  x0 - const memory area 1 pointer +*  x1 - const memory area 2 pointer +*  x2 - the maximal compare byte length +* Returns: +*  x0 - a compare result, maybe less than, equal to, or greater than ZERO +*/ + +/* Parameters and result.  */ +src1		.req	x0 +src2		.req	x1 +limit		.req	x2 +result		.req	x0 + +/* Internal variables.  */ +data1		.req	x3 +data1w		.req	w3 +data2		.req	x4 +data2w		.req	w4 +has_nul		.req	x5 +diff		.req	x6 +endloop		.req	x7 +tmp1		.req	x8 +tmp2		.req	x9 +tmp3		.req	x10 +pos		.req	x11 +limit_wd	.req	x12 +mask		.req	x13 + +ENTRY(memcmp) +	cbz	limit, .Lret0 +	eor	tmp1, src1, src2 +	tst	tmp1, #7 +	b.ne	.Lmisaligned8 +	ands	tmp1, src1, #7 +	b.ne	.Lmutual_align +	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */ +	lsr	limit_wd, limit_wd, #3 /* Convert to Dwords.  */ +	/* +	* The input source addresses are at alignment boundary. +	* Directly compare eight bytes each time. +	*/ +.Lloop_aligned: +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 +.Lstart_realigned: +	subs	limit_wd, limit_wd, #1 +	eor	diff, data1, data2	/* Non-zero if differences found.  */ +	csinv	endloop, diff, xzr, cs	/* Last Dword or differences.  */ +	cbz	endloop, .Lloop_aligned + +	/* Not reached the limit, must have found a diff.  */ +	tbz	limit_wd, #63, .Lnot_limit + +	/* Limit % 8 == 0 => the diff is in the last 8 bytes. */ +	ands	limit, limit, #7 +	b.eq	.Lnot_limit +	/* +	* The remained bytes less than 8. It is needed to extract valid data +	* from last eight bytes of the intended memory range. +	*/ +	lsl	limit, limit, #3	/* bytes-> bits.  */ +	mov	mask, #~0 +CPU_BE( lsr	mask, mask, limit ) +CPU_LE( lsl	mask, mask, limit ) +	bic	data1, data1, mask +	bic	data2, data2, mask + +	orr	diff, diff, mask +	b	.Lnot_limit + +.Lmutual_align: +	/* +	* Sources are mutually aligned, but are not currently at an +	* alignment boundary. Round down the addresses and then mask off +	* the bytes that precede the start point. +	*/ +	bic	src1, src1, #7 +	bic	src2, src2, #7 +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 +	/* +	* We can not add limit with alignment offset(tmp1) here. Since the +	* addition probably make the limit overflown. +	*/ +	sub	limit_wd, limit, #1/*limit != 0, so no underflow.*/ +	and	tmp3, limit_wd, #7 +	lsr	limit_wd, limit_wd, #3 +	add	tmp3, tmp3, tmp1 +	add	limit_wd, limit_wd, tmp3, lsr #3 +	add	limit, limit, tmp1/* Adjust the limit for the extra.  */ + +	lsl	tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/ +	neg	tmp1, tmp1/* Bits to alignment -64.  */ +	mov	tmp2, #~0 +	/*mask off the non-intended bytes before the start address.*/ +CPU_BE( lsl	tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/ +	/* Little-endian.  Early bytes are at LSB.  
*/ +CPU_LE( lsr	tmp2, tmp2, tmp1 ) + +	orr	data1, data1, tmp2 +	orr	data2, data2, tmp2 +	b	.Lstart_realigned + +	/*src1 and src2 have different alignment offset.*/ +.Lmisaligned8: +	cmp	limit, #8 +	b.lo	.Ltiny8proc /*limit < 8: compare byte by byte*/ + +	and	tmp1, src1, #7 +	neg	tmp1, tmp1 +	add	tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/ +	and	tmp2, src2, #7 +	neg	tmp2, tmp2 +	add	tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/ +	subs	tmp3, tmp1, tmp2 +	csel	pos, tmp1, tmp2, hi /*Choose the maximum.*/ + +	sub	limit, limit, pos +	/*compare the proceeding bytes in the first 8 byte segment.*/ +.Ltinycmp: +	ldrb	data1w, [src1], #1 +	ldrb	data2w, [src2], #1 +	subs	pos, pos, #1 +	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000.  */ +	b.eq	.Ltinycmp +	cbnz	pos, 1f /*diff occurred before the last byte.*/ +	cmp	data1w, data2w +	b.eq	.Lstart_align +1: +	sub	result, data1, data2 +	ret + +.Lstart_align: +	lsr	limit_wd, limit, #3 +	cbz	limit_wd, .Lremain8 + +	ands	xzr, src1, #7 +	b.eq	.Lrecal_offset +	/*process more leading bytes to make src1 aligned...*/ +	add	src1, src1, tmp3 /*backwards src1 to alignment boundary*/ +	add	src2, src2, tmp3 +	sub	limit, limit, tmp3 +	lsr	limit_wd, limit, #3 +	cbz	limit_wd, .Lremain8 +	/*load 8 bytes from aligned SRC1..*/ +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 + +	subs	limit_wd, limit_wd, #1 +	eor	diff, data1, data2  /*Non-zero if differences found.*/ +	csinv	endloop, diff, xzr, ne +	cbnz	endloop, .Lunequal_proc +	/*How far is the current SRC2 from the alignment boundary...*/ +	and	tmp3, tmp3, #7 + +.Lrecal_offset:/*src1 is aligned now..*/ +	neg	pos, tmp3 +.Lloopcmp_proc: +	/* +	* Divide the eight bytes into two parts. First,backwards the src2 +	* to an alignment boundary,load eight bytes and compare from +	* the SRC2 alignment boundary. If all 8 bytes are equal,then start +	* the second part's comparison. Otherwise finish the comparison. +	* This special handle can garantee all the accesses are in the +	* thread/task space in avoid to overrange access. +	*/ +	ldr	data1, [src1,pos] +	ldr	data2, [src2,pos] +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	cbnz	diff, .Lnot_limit + +	/*The second part process*/ +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	subs	limit_wd, limit_wd, #1 +	csinv	endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ +	cbz	endloop, .Lloopcmp_proc +.Lunequal_proc: +	cbz	diff, .Lremain8 + +/*There is differnence occured in the latest comparison.*/ +.Lnot_limit: +/* +* For little endian,reverse the low significant equal bits into MSB,then +* following CLZ can find how many equal bits exist. +*/ +CPU_LE( rev	diff, diff ) +CPU_LE( rev	data1, data1 ) +CPU_LE( rev	data2, data2 ) + +	/* +	* The MS-non-zero bit of DIFF marks either the first bit +	* that is different, or the end of the significant data. +	* Shifting left now will bring the critical information into the +	* top bits. +	*/ +	clz	pos, diff +	lsl	data1, data1, pos +	lsl	data2, data2, pos +	/* +	* We need to zero-extend (char is unsigned) the value and then +	* perform a signed subtraction. +	*/ +	lsr	data1, data1, #56 +	sub	result, data1, data2, lsr #56 +	ret + +.Lremain8: +	/* Limit % 8 == 0 =>. all data are equal.*/ +	ands	limit, limit, #7 +	b.eq	.Lret0 + +.Ltiny8proc: +	ldrb	data1w, [src1], #1 +	ldrb	data2w, [src2], #1 +	subs	limit, limit, #1 + +	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000. 
*/ +	b.eq	.Ltiny8proc +	sub	result, data1, data2 +	ret +.Lret0: +	mov	result, #0 +	ret +ENDPROC(memcmp) diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S new file mode 100644 index 00000000000..8a9a96d3dda --- /dev/null +++ b/arch/arm64/lib/memcpy.S @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/cache.h> + +/* + * Copy a buffer from src to dest (alignment handled by the hardware) + * + * Parameters: + *	x0 - dest + *	x1 - src + *	x2 - n + * Returns: + *	x0 - dest + */ +dstin	.req	x0 +src	.req	x1 +count	.req	x2 +tmp1	.req	x3 +tmp1w	.req	w3 +tmp2	.req	x4 +tmp2w	.req	w4 +tmp3	.req	x5 +tmp3w	.req	w5 +dst	.req	x6 + +A_l	.req	x7 +A_h	.req	x8 +B_l	.req	x9 +B_h	.req	x10 +C_l	.req	x11 +C_h	.req	x12 +D_l	.req	x13 +D_h	.req	x14 + +ENTRY(memcpy) +	mov	dst, dstin +	cmp	count, #16 +	/*When memory length is less than 16, the accessed are not aligned.*/ +	b.lo	.Ltiny15 + +	neg	tmp2, src +	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */ +	b.eq	.LSrcAligned +	sub	count, count, tmp2 +	/* +	* Copy the leading memory data from src to dst in an increasing +	* address order.By this way,the risk of overwritting the source +	* memory data is eliminated when the distance between src and +	* dst is less than 16. The memory accesses here are alignment. +	*/ +	tbz	tmp2, #0, 1f +	ldrb	tmp1w, [src], #1 +	strb	tmp1w, [dst], #1 +1: +	tbz	tmp2, #1, 2f +	ldrh	tmp1w, [src], #2 +	strh	tmp1w, [dst], #2 +2: +	tbz	tmp2, #2, 3f +	ldr	tmp1w, [src], #4 +	str	tmp1w, [dst], #4 +3: +	tbz	tmp2, #3, .LSrcAligned +	ldr	tmp1, [src],#8 +	str	tmp1, [dst],#8 + +.LSrcAligned: +	cmp	count, #64 +	b.ge	.Lcpy_over64 +	/* +	* Deal with small copies quickly by dropping straight into the +	* exit block. +	*/ +.Ltail63: +	/* +	* Copy up to 48 bytes of data. At this point we only need the +	* bottom 6 bits of count to be accurate. +	*/ +	ands	tmp1, count, #0x30 +	b.eq	.Ltiny15 +	cmp	tmp1w, #0x20 +	b.eq	1f +	b.lt	2f +	ldp	A_l, A_h, [src], #16 +	stp	A_l, A_h, [dst], #16 +1: +	ldp	A_l, A_h, [src], #16 +	stp	A_l, A_h, [dst], #16 +2: +	ldp	A_l, A_h, [src], #16 +	stp	A_l, A_h, [dst], #16 +.Ltiny15: +	/* +	* Prefer to break one ldp/stp into several load/store to access +	* memory in an increasing address order,rather than to load/store 16 +	* bytes from (src-16) to (dst-16) and to backward the src to aligned +	* address,which way is used in original cortex memcpy. 
If keeping +	* the original memcpy process here, memmove need to satisfy the +	* precondition that src address is at least 16 bytes bigger than dst +	* address,otherwise some source data will be overwritten when memove +	* call memcpy directly. To make memmove simpler and decouple the +	* memcpy's dependency on memmove, withdrew the original process. +	*/ +	tbz	count, #3, 1f +	ldr	tmp1, [src], #8 +	str	tmp1, [dst], #8 +1: +	tbz	count, #2, 2f +	ldr	tmp1w, [src], #4 +	str	tmp1w, [dst], #4 +2: +	tbz	count, #1, 3f +	ldrh	tmp1w, [src], #2 +	strh	tmp1w, [dst], #2 +3: +	tbz	count, #0, .Lexitfunc +	ldrb	tmp1w, [src] +	strb	tmp1w, [dst] + +.Lexitfunc: +	ret + +.Lcpy_over64: +	subs	count, count, #128 +	b.ge	.Lcpy_body_large +	/* +	* Less than 128 bytes to copy, so handle 64 here and then jump +	* to the tail. +	*/ +	ldp	A_l, A_h, [src],#16 +	stp	A_l, A_h, [dst],#16 +	ldp	B_l, B_h, [src],#16 +	ldp	C_l, C_h, [src],#16 +	stp	B_l, B_h, [dst],#16 +	stp	C_l, C_h, [dst],#16 +	ldp	D_l, D_h, [src],#16 +	stp	D_l, D_h, [dst],#16 + +	tst	count, #0x3f +	b.ne	.Ltail63 +	ret + +	/* +	* Critical loop.  Start at a new cache line boundary.  Assuming +	* 64 bytes per line this ensures the entire loop is in one line. +	*/ +	.p2align	L1_CACHE_SHIFT +.Lcpy_body_large: +	/* pre-get 64 bytes data. */ +	ldp	A_l, A_h, [src],#16 +	ldp	B_l, B_h, [src],#16 +	ldp	C_l, C_h, [src],#16 +	ldp	D_l, D_h, [src],#16 +1: +	/* +	* interlace the load of next 64 bytes data block with store of the last +	* loaded 64 bytes data. +	*/ +	stp	A_l, A_h, [dst],#16 +	ldp	A_l, A_h, [src],#16 +	stp	B_l, B_h, [dst],#16 +	ldp	B_l, B_h, [src],#16 +	stp	C_l, C_h, [dst],#16 +	ldp	C_l, C_h, [src],#16 +	stp	D_l, D_h, [dst],#16 +	ldp	D_l, D_h, [src],#16 +	subs	count, count, #64 +	b.ge	1b +	stp	A_l, A_h, [dst],#16 +	stp	B_l, B_h, [dst],#16 +	stp	C_l, C_h, [dst],#16 +	stp	D_l, D_h, [dst],#16 + +	tst	count, #0x3f +	b.ne	.Ltail63 +	ret +ENDPROC(memcpy) diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S new file mode 100644 index 00000000000..57b19ea2dad --- /dev/null +++ b/arch/arm64/lib/memmove.S @@ -0,0 +1,197 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/cache.h> + +/* + * Move a buffer from src to test (alignment handled by the hardware). + * If dest <= src, call memcpy, otherwise copy in reverse order. 
+ * + * Parameters: + *	x0 - dest + *	x1 - src + *	x2 - n + * Returns: + *	x0 - dest + */ +dstin	.req	x0 +src	.req	x1 +count	.req	x2 +tmp1	.req	x3 +tmp1w	.req	w3 +tmp2	.req	x4 +tmp2w	.req	w4 +tmp3	.req	x5 +tmp3w	.req	w5 +dst	.req	x6 + +A_l	.req	x7 +A_h	.req	x8 +B_l	.req	x9 +B_h	.req	x10 +C_l	.req	x11 +C_h	.req	x12 +D_l	.req	x13 +D_h	.req	x14 + +ENTRY(memmove) +	cmp	dstin, src +	b.lo	memcpy +	add	tmp1, src, count +	cmp	dstin, tmp1 +	b.hs	memcpy		/* No overlap.  */ + +	add	dst, dstin, count +	add	src, src, count +	cmp	count, #16 +	b.lo	.Ltail15  /*probably non-alignment accesses.*/ + +	ands	tmp2, src, #15     /* Bytes to reach alignment.  */ +	b.eq	.LSrcAligned +	sub	count, count, tmp2 +	/* +	* process the aligned offset length to make the src aligned firstly. +	* those extra instructions' cost is acceptable. It also make the +	* coming accesses are based on aligned address. +	*/ +	tbz	tmp2, #0, 1f +	ldrb	tmp1w, [src, #-1]! +	strb	tmp1w, [dst, #-1]! +1: +	tbz	tmp2, #1, 2f +	ldrh	tmp1w, [src, #-2]! +	strh	tmp1w, [dst, #-2]! +2: +	tbz	tmp2, #2, 3f +	ldr	tmp1w, [src, #-4]! +	str	tmp1w, [dst, #-4]! +3: +	tbz	tmp2, #3, .LSrcAligned +	ldr	tmp1, [src, #-8]! +	str	tmp1, [dst, #-8]! + +.LSrcAligned: +	cmp	count, #64 +	b.ge	.Lcpy_over64 + +	/* +	* Deal with small copies quickly by dropping straight into the +	* exit block. +	*/ +.Ltail63: +	/* +	* Copy up to 48 bytes of data. At this point we only need the +	* bottom 6 bits of count to be accurate. +	*/ +	ands	tmp1, count, #0x30 +	b.eq	.Ltail15 +	cmp	tmp1w, #0x20 +	b.eq	1f +	b.lt	2f +	ldp	A_l, A_h, [src, #-16]! +	stp	A_l, A_h, [dst, #-16]! +1: +	ldp	A_l, A_h, [src, #-16]! +	stp	A_l, A_h, [dst, #-16]! +2: +	ldp	A_l, A_h, [src, #-16]! +	stp	A_l, A_h, [dst, #-16]! + +.Ltail15: +	tbz	count, #3, 1f +	ldr	tmp1, [src, #-8]! +	str	tmp1, [dst, #-8]! +1: +	tbz	count, #2, 2f +	ldr	tmp1w, [src, #-4]! +	str	tmp1w, [dst, #-4]! +2: +	tbz	count, #1, 3f +	ldrh	tmp1w, [src, #-2]! +	strh	tmp1w, [dst, #-2]! +3: +	tbz	count, #0, .Lexitfunc +	ldrb	tmp1w, [src, #-1] +	strb	tmp1w, [dst, #-1] + +.Lexitfunc: +	ret + +.Lcpy_over64: +	subs	count, count, #128 +	b.ge	.Lcpy_body_large +	/* +	* Less than 128 bytes to copy, so handle 64 bytes here and then jump +	* to the tail. +	*/ +	ldp	A_l, A_h, [src, #-16] +	stp	A_l, A_h, [dst, #-16] +	ldp	B_l, B_h, [src, #-32] +	ldp	C_l, C_h, [src, #-48] +	stp	B_l, B_h, [dst, #-32] +	stp	C_l, C_h, [dst, #-48] +	ldp	D_l, D_h, [src, #-64]! +	stp	D_l, D_h, [dst, #-64]! + +	tst	count, #0x3f +	b.ne	.Ltail63 +	ret + +	/* +	* Critical loop. Start at a new cache line boundary. Assuming +	* 64 bytes per line this ensures the entire loop is in one line. +	*/ +	.p2align	L1_CACHE_SHIFT +.Lcpy_body_large: +	/* pre-load 64 bytes data. */ +	ldp	A_l, A_h, [src, #-16] +	ldp	B_l, B_h, [src, #-32] +	ldp	C_l, C_h, [src, #-48] +	ldp	D_l, D_h, [src, #-64]! +1: +	/* +	* interlace the load of next 64 bytes data block with store of the last +	* loaded 64 bytes data. +	*/ +	stp	A_l, A_h, [dst, #-16] +	ldp	A_l, A_h, [src, #-16] +	stp	B_l, B_h, [dst, #-32] +	ldp	B_l, B_h, [src, #-32] +	stp	C_l, C_h, [dst, #-48] +	ldp	C_l, C_h, [src, #-48] +	stp	D_l, D_h, [dst, #-64]! +	ldp	D_l, D_h, [src, #-64]! +	subs	count, count, #64 +	b.ge	1b +	stp	A_l, A_h, [dst, #-16] +	stp	B_l, B_h, [dst, #-32] +	stp	C_l, C_h, [dst, #-48] +	stp	D_l, D_h, [dst, #-64]! 
+ +	tst	count, #0x3f +	b.ne	.Ltail63 +	ret +ENDPROC(memmove) diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S new file mode 100644 index 00000000000..7c72dfd36b6 --- /dev/null +++ b/arch/arm64/lib/memset.S @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/cache.h> + +/* + * Fill in the buffer with character c (alignment handled by the hardware) + * + * Parameters: + *	x0 - buf + *	x1 - c + *	x2 - n + * Returns: + *	x0 - buf + */ + +dstin		.req	x0 +val		.req	w1 +count		.req	x2 +tmp1		.req	x3 +tmp1w		.req	w3 +tmp2		.req	x4 +tmp2w		.req	w4 +zva_len_x	.req	x5 +zva_len		.req	w5 +zva_bits_x	.req	x6 + +A_l		.req	x7 +A_lw		.req	w7 +dst		.req	x8 +tmp3w		.req	w9 +tmp3		.req	x9 + +ENTRY(memset) +	mov	dst, dstin	/* Preserve return value.  */ +	and	A_lw, val, #255 +	orr	A_lw, A_lw, A_lw, lsl #8 +	orr	A_lw, A_lw, A_lw, lsl #16 +	orr	A_l, A_l, A_l, lsl #32 + +	cmp	count, #15 +	b.hi	.Lover16_proc +	/*All store maybe are non-aligned..*/ +	tbz	count, #3, 1f +	str	A_l, [dst], #8 +1: +	tbz	count, #2, 2f +	str	A_lw, [dst], #4 +2: +	tbz	count, #1, 3f +	strh	A_lw, [dst], #2 +3: +	tbz	count, #0, 4f +	strb	A_lw, [dst] +4: +	ret + +.Lover16_proc: +	/*Whether  the start address is aligned with 16.*/ +	neg	tmp2, dst +	ands	tmp2, tmp2, #15 +	b.eq	.Laligned +/* +* The count is not less than 16, we can use stp to store the start 16 bytes, +* then adjust the dst aligned with 16.This process will make the current +* memory address at alignment boundary. +*/ +	stp	A_l, A_l, [dst] /*non-aligned store..*/ +	/*make the dst aligned..*/ +	sub	count, count, tmp2 +	add	dst, dst, tmp2 + +.Laligned: +	cbz	A_l, .Lzero_mem + +.Ltail_maybe_long: +	cmp	count, #64 +	b.ge	.Lnot_short +.Ltail63: +	ands	tmp1, count, #0x30 +	b.eq	3f +	cmp	tmp1w, #0x20 +	b.eq	1f +	b.lt	2f +	stp	A_l, A_l, [dst], #16 +1: +	stp	A_l, A_l, [dst], #16 +2: +	stp	A_l, A_l, [dst], #16 +/* +* The last store length is less than 16,use stp to write last 16 bytes. +* It will lead some bytes written twice and the access is non-aligned. +*/ +3: +	ands	count, count, #15 +	cbz	count, 4f +	add	dst, dst, count +	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */ +4: +	ret + +	/* +	* Critical loop. Start at a new cache line boundary. Assuming +	* 64 bytes per line, this ensures the entire loop is in one line. +	*/ +	.p2align	L1_CACHE_SHIFT +.Lnot_short: +	sub	dst, dst, #16/* Pre-bias.  */ +	sub	count, count, #64 +1: +	stp	A_l, A_l, [dst, #16] +	stp	A_l, A_l, [dst, #32] +	stp	A_l, A_l, [dst, #48] +	stp	A_l, A_l, [dst, #64]! 
+	subs	count, count, #64 +	b.ge	1b +	tst	count, #0x3f +	add	dst, dst, #16 +	b.ne	.Ltail63 +.Lexitfunc: +	ret + +	/* +	* For zeroing memory, check to see if we can use the ZVA feature to +	* zero entire 'cache' lines. +	*/ +.Lzero_mem: +	cmp	count, #63 +	b.le	.Ltail63 +	/* +	* For zeroing small amounts of memory, it's not worth setting up +	* the line-clear code. +	*/ +	cmp	count, #128 +	b.lt	.Lnot_short /*count is at least  128 bytes*/ + +	mrs	tmp1, dczid_el0 +	tbnz	tmp1, #4, .Lnot_short +	mov	tmp3w, #4 +	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */ +	lsl	zva_len, tmp3w, zva_len + +	ands	tmp3w, zva_len, #63 +	/* +	* ensure the zva_len is not less than 64. +	* It is not meaningful to use ZVA if the block size is less than 64. +	*/ +	b.ne	.Lnot_short +.Lzero_by_line: +	/* +	* Compute how far we need to go to become suitably aligned. We're +	* already at quad-word alignment. +	*/ +	cmp	count, zva_len_x +	b.lt	.Lnot_short		/* Not enough to reach alignment.  */ +	sub	zva_bits_x, zva_len_x, #1 +	neg	tmp2, dst +	ands	tmp2, tmp2, zva_bits_x +	b.eq	2f			/* Already aligned.  */ +	/* Not aligned, check that there's enough to copy after alignment.*/ +	sub	tmp1, count, tmp2 +	/* +	* grantee the remain length to be ZVA is bigger than 64, +	* avoid to make the 2f's process over mem range.*/ +	cmp	tmp1, #64 +	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */ +	b.lt	.Lnot_short +	/* +	* We know that there's at least 64 bytes to zero and that it's safe +	* to overrun by 64 bytes. +	*/ +	mov	count, tmp1 +1: +	stp	A_l, A_l, [dst] +	stp	A_l, A_l, [dst, #16] +	stp	A_l, A_l, [dst, #32] +	subs	tmp2, tmp2, #64 +	stp	A_l, A_l, [dst, #48] +	add	dst, dst, #64 +	b.ge	1b +	/* We've overrun a bit, so adjust dst downwards.*/ +	add	dst, dst, tmp2 +2: +	sub	count, count, zva_len_x +3: +	dc	zva, dst +	add	dst, dst, zva_len_x +	subs	count, count, zva_len_x +	b.ge	3b +	ands	count, count, zva_bits_x +	b.ne	.Ltail_maybe_long +	ret +ENDPROC(memset) diff --git a/arch/arm64/lib/strchr.S b/arch/arm64/lib/strchr.S new file mode 100644 index 00000000000..dae0cf5591f --- /dev/null +++ b/arch/arm64/lib/strchr.S @@ -0,0 +1,42 @@ +/* + * Based on arch/arm/lib/strchr.S + * + * Copyright (C) 1995-2000 Russell King + * Copyright (C) 2013 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Find the first occurrence of a character in a string. + * + * Parameters: + *	x0 - str + *	x1 - c + * Returns: + *	x0 - address of first occurrence of 'c' or 0 + */ +ENTRY(strchr) +	and	w1, w1, #0xff +1:	ldrb	w2, [x0], #1 +	cmp	w2, w1 +	ccmp	w2, wzr, #4, ne +	b.ne	1b +	sub	x0, x0, #1 +	cmp	w2, w1 +	csel	x0, x0, xzr, eq +	ret +ENDPROC(strchr) diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S new file mode 100644 index 00000000000..42f828b06c5 --- /dev/null +++ b/arch/arm64/lib/strcmp.S @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. 
+ * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * compare two strings + * + * Parameters: + *	x0 - const string 1 pointer + *    x1 - const string 2 pointer + * Returns: + * x0 - an integer less than, equal to, or greater than zero + * if  s1  is  found, respectively, to be less than, to match, + * or be greater than s2. + */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +/* Parameters and result.  */ +src1		.req	x0 +src2		.req	x1 +result		.req	x0 + +/* Internal variables.  */ +data1		.req	x2 +data1w		.req	w2 +data2		.req	x3 +data2w		.req	w3 +has_nul		.req	x4 +diff		.req	x5 +syndrome	.req	x6 +tmp1		.req	x7 +tmp2		.req	x8 +tmp3		.req	x9 +zeroones	.req	x10 +pos		.req	x11 + +ENTRY(strcmp) +	eor	tmp1, src1, src2 +	mov	zeroones, #REP8_01 +	tst	tmp1, #7 +	b.ne	.Lmisaligned8 +	ands	tmp1, src1, #7 +	b.ne	.Lmutual_align + +	/* +	* NUL detection works on the principle that (X - 1) & (~X) & 0x80 +	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and +	* can be done in parallel across the entire word. +	*/ +.Lloop_aligned: +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 +.Lstart_realigned: +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	eor	diff, data1, data2	/* Non-zero if differences found.  */ +	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */ +	orr	syndrome, diff, has_nul +	cbz	syndrome, .Lloop_aligned +	b	.Lcal_cmpresult + +.Lmutual_align: +	/* +	* Sources are mutually aligned, but are not currently at an +	* alignment boundary.  Round down the addresses and then mask off +	* the bytes that preceed the start point. +	*/ +	bic	src1, src1, #7 +	bic	src2, src2, #7 +	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */ +	ldr	data1, [src1], #8 +	neg	tmp1, tmp1		/* Bits to alignment -64.  */ +	ldr	data2, [src2], #8 +	mov	tmp2, #~0 +	/* Big-endian.  Early bytes are at MSB.  */ +CPU_BE( lsl	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */ +	/* Little-endian.  Early bytes are at LSB.  */ +CPU_LE( lsr	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */ + +	orr	data1, data1, tmp2 +	orr	data2, data2, tmp2 +	b	.Lstart_realigned + +.Lmisaligned8: +	/* +	* Get the align offset length to compare per byte first. +	* After this process, one string's address will be aligned. +	*/ +	and	tmp1, src1, #7 +	neg	tmp1, tmp1 +	add	tmp1, tmp1, #8 +	and	tmp2, src2, #7 +	neg	tmp2, tmp2 +	add	tmp2, tmp2, #8 +	subs	tmp3, tmp1, tmp2 +	csel	pos, tmp1, tmp2, hi /*Choose the maximum. */ +.Ltinycmp: +	ldrb	data1w, [src1], #1 +	ldrb	data2w, [src2], #1 +	subs	pos, pos, #1 +	ccmp	data1w, #1, #0, ne  /* NZCV = 0b0000.  
*/ +	ccmp	data1w, data2w, #0, cs  /* NZCV = 0b0000.  */ +	b.eq	.Ltinycmp +	cbnz	pos, 1f /*find the null or unequal...*/ +	cmp	data1w, #1 +	ccmp	data1w, data2w, #0, cs +	b.eq	.Lstart_align /*the last bytes are equal....*/ +1: +	sub	result, data1, data2 +	ret + +.Lstart_align: +	ands	xzr, src1, #7 +	b.eq	.Lrecal_offset +	/*process more leading bytes to make str1 aligned...*/ +	add	src1, src1, tmp3 +	add	src2, src2, tmp3 +	/*load 8 bytes from aligned str1 and non-aligned str2..*/ +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 + +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	bic	has_nul, tmp1, tmp2 +	eor	diff, data1, data2 /* Non-zero if differences found.  */ +	orr	syndrome, diff, has_nul +	cbnz	syndrome, .Lcal_cmpresult +	/*How far is the current str2 from the alignment boundary...*/ +	and	tmp3, tmp3, #7 +.Lrecal_offset: +	neg	pos, tmp3 +.Lloopcmp_proc: +	/* +	* Divide the eight bytes into two parts. First,backwards the src2 +	* to an alignment boundary,load eight bytes from the SRC2 alignment +	* boundary,then compare with the relative bytes from SRC1. +	* If all 8 bytes are equal,then start the second part's comparison. +	* Otherwise finish the comparison. +	* This special handle can garantee all the accesses are in the +	* thread/task space in avoid to overrange access. +	*/ +	ldr	data1, [src1,pos] +	ldr	data2, [src2,pos] +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	bic	has_nul, tmp1, tmp2 +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	orr	syndrome, diff, has_nul +	cbnz	syndrome, .Lcal_cmpresult + +	/*The second part process*/ +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	bic	has_nul, tmp1, tmp2 +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	orr	syndrome, diff, has_nul +	cbz	syndrome, .Lloopcmp_proc + +.Lcal_cmpresult: +	/* +	* reversed the byte-order as big-endian,then CLZ can find the most +	* significant zero bits. +	*/ +CPU_LE( rev	syndrome, syndrome ) +CPU_LE( rev	data1, data1 ) +CPU_LE( rev	data2, data2 ) + +	/* +	* For big-endian we cannot use the trick with the syndrome value +	* as carry-propagation can corrupt the upper bits if the trailing +	* bytes in the string contain 0x01. +	* However, if there is no NUL byte in the dword, we can generate +	* the result directly.  We ca not just subtract the bytes as the +	* MSB might be significant. +	*/ +CPU_BE( cbnz	has_nul, 1f ) +CPU_BE( cmp	data1, data2 ) +CPU_BE( cset	result, ne ) +CPU_BE( cneg	result, result, lo ) +CPU_BE( ret ) +CPU_BE( 1: ) +	/*Re-compute the NUL-byte detection, using a byte-reversed value. */ +CPU_BE(	rev	tmp3, data1 ) +CPU_BE(	sub	tmp1, tmp3, zeroones ) +CPU_BE(	orr	tmp2, tmp3, #REP8_7f ) +CPU_BE(	bic	has_nul, tmp1, tmp2 ) +CPU_BE(	rev	has_nul, has_nul ) +CPU_BE(	orr	syndrome, diff, has_nul ) + +	clz	pos, syndrome +	/* +	* The MS-non-zero bit of the syndrome marks either the first bit +	* that is different, or the top bit of the first zero byte. +	* Shifting left now will bring the critical information into the +	* top bits. +	*/ +	lsl	data1, data1, pos +	lsl	data2, data2, pos +	/* +	* But we need to zero-extend (char is unsigned) the value and then +	* perform a signed 32-bit subtraction. +	*/ +	lsr	data1, data1, #56 +	sub	result, data1, data2, lsr #56 +	ret +ENDPROC(strcmp) diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S new file mode 100644 index 00000000000..987b68b9ce4 --- /dev/null +++ b/arch/arm64/lib/strlen.S @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2013 ARM Ltd. 
+ * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * calculate the length of a string + * + * Parameters: + *	x0 - const string pointer + * Returns: + *	x0 - the return length of specific string + */ + +/* Arguments and results.  */ +srcin		.req	x0 +len		.req	x0 + +/* Locals and temporaries.  */ +src		.req	x1 +data1		.req	x2 +data2		.req	x3 +data2a		.req	x4 +has_nul1	.req	x5 +has_nul2	.req	x6 +tmp1		.req	x7 +tmp2		.req	x8 +tmp3		.req	x9 +tmp4		.req	x10 +zeroones	.req	x11 +pos		.req	x12 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +ENTRY(strlen) +	mov	zeroones, #REP8_01 +	bic	src, srcin, #15 +	ands	tmp1, srcin, #15 +	b.ne	.Lmisaligned +	/* +	* NUL detection works on the principle that (X - 1) & (~X) & 0x80 +	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and +	* can be done in parallel across the entire word. +	*/ +	/* +	* The inner loop deals with two Dwords at a time. This has a +	* slightly higher start-up cost, but we should win quite quickly, +	* especially on cores with a high number of issue slots per +	* cycle, as we get much better parallelism out of the operations. +	*/ +.Lloop: +	ldp	data1, data2, [src], #16 +.Lrealigned: +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	sub	tmp3, data2, zeroones +	orr	tmp4, data2, #REP8_7f +	bic	has_nul1, tmp1, tmp2 +	bics	has_nul2, tmp3, tmp4 +	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */ +	b.eq	.Lloop + +	sub	len, src, srcin +	cbz	has_nul1, .Lnul_in_data2 +CPU_BE(	mov	data2, data1 )	/*prepare data to re-calculate the syndrome*/ +	sub	len, len, #8 +	mov	has_nul2, has_nul1 +.Lnul_in_data2: +	/* +	* For big-endian, carry propagation (if the final byte in the +	* string is 0x01) means we cannot use has_nul directly.  The +	* easiest way to get the correct byte is to byte-swap the data +	* and calculate the syndrome a second time. +	*/ +CPU_BE( rev	data2, data2 ) +CPU_BE( sub	tmp1, data2, zeroones ) +CPU_BE( orr	tmp2, data2, #REP8_7f ) +CPU_BE( bic	has_nul2, tmp1, tmp2 ) + +	sub	len, len, #8 +	rev	has_nul2, has_nul2 +	clz	pos, has_nul2 +	add	len, len, pos, lsr #3		/* Bits to bytes.  */ +	ret + +.Lmisaligned: +	cmp	tmp1, #8 +	neg	tmp1, tmp1 +	ldp	data1, data2, [src], #16 +	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */ +	mov	tmp2, #~0 +	/* Big-endian.  Early bytes are at MSB.  */ +CPU_BE( lsl	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */ +	/* Little-endian.  Early bytes are at LSB.  */ +CPU_LE( lsr	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  
*/ + +	orr	data1, data1, tmp2 +	orr	data2a, data2, tmp2 +	csinv	data1, data1, xzr, le +	csel	data2, data2, data2a, le +	b	.Lrealigned +ENDPROC(strlen) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S new file mode 100644 index 00000000000..0224cf5a553 --- /dev/null +++ b/arch/arm64/lib/strncmp.S @@ -0,0 +1,310 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * compare two strings + * + * Parameters: + *  x0 - const string 1 pointer + *  x1 - const string 2 pointer + *  x2 - the maximal length to be compared + * Returns: + *  x0 - an integer less than, equal to, or greater than zero if s1 is found, + *     respectively, to be less than, to match, or be greater than s2. + */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +/* Parameters and result.  */ +src1		.req	x0 +src2		.req	x1 +limit		.req	x2 +result		.req	x0 + +/* Internal variables.  */ +data1		.req	x3 +data1w		.req	w3 +data2		.req	x4 +data2w		.req	w4 +has_nul		.req	x5 +diff		.req	x6 +syndrome	.req	x7 +tmp1		.req	x8 +tmp2		.req	x9 +tmp3		.req	x10 +zeroones	.req	x11 +pos		.req	x12 +limit_wd	.req	x13 +mask		.req	x14 +endloop		.req	x15 + +ENTRY(strncmp) +	cbz	limit, .Lret0 +	eor	tmp1, src1, src2 +	mov	zeroones, #REP8_01 +	tst	tmp1, #7 +	b.ne	.Lmisaligned8 +	ands	tmp1, src1, #7 +	b.ne	.Lmutual_align +	/* Calculate the number of full and partial words -1.  */ +	/* +	* when limit is mulitply of 8, if not sub 1, +	* the judgement of last dword will wrong. +	*/ +	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */ +	lsr	limit_wd, limit_wd, #3  /* Convert to Dwords.  */ + +	/* +	* NUL detection works on the principle that (X - 1) & (~X) & 0x80 +	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and +	* can be done in parallel across the entire word. +	*/ +.Lloop_aligned: +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 +.Lstart_realigned: +	subs	limit_wd, limit_wd, #1 +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	csinv	endloop, diff, xzr, pl  /* Last Dword or differences.*/ +	bics	has_nul, tmp1, tmp2 /* Non-zero if NUL terminator.  */ +	ccmp	endloop, #0, #0, eq +	b.eq	.Lloop_aligned + +	/*Not reached the limit, must have found the end or a diff.  */ +	tbz	limit_wd, #63, .Lnot_limit + +	/* Limit % 8 == 0 => all bytes significant.  */ +	ands	limit, limit, #7 +	b.eq	.Lnot_limit + +	lsl	limit, limit, #3    /* Bits -> bytes.  
*/ +	mov	mask, #~0 +CPU_BE( lsr	mask, mask, limit ) +CPU_LE( lsl	mask, mask, limit ) +	bic	data1, data1, mask +	bic	data2, data2, mask + +	/* Make sure that the NUL byte is marked in the syndrome.  */ +	orr	has_nul, has_nul, mask + +.Lnot_limit: +	orr	syndrome, diff, has_nul +	b	.Lcal_cmpresult + +.Lmutual_align: +	/* +	* Sources are mutually aligned, but are not currently at an +	* alignment boundary.  Round down the addresses and then mask off +	* the bytes that precede the start point. +	* We also need to adjust the limit calculations, but without +	* overflowing if the limit is near ULONG_MAX. +	*/ +	bic	src1, src1, #7 +	bic	src2, src2, #7 +	ldr	data1, [src1], #8 +	neg	tmp3, tmp1, lsl #3  /* 64 - bits(bytes beyond align). */ +	ldr	data2, [src2], #8 +	mov	tmp2, #~0 +	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */ +	/* Big-endian.  Early bytes are at MSB.  */ +CPU_BE( lsl	tmp2, tmp2, tmp3 )	/* Shift (tmp1 & 63).  */ +	/* Little-endian.  Early bytes are at LSB.  */ +CPU_LE( lsr	tmp2, tmp2, tmp3 )	/* Shift (tmp1 & 63).  */ + +	and	tmp3, limit_wd, #7 +	lsr	limit_wd, limit_wd, #3 +	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/ +	add	limit, limit, tmp1 +	add	tmp3, tmp3, tmp1 +	orr	data1, data1, tmp2 +	orr	data2, data2, tmp2 +	add	limit_wd, limit_wd, tmp3, lsr #3 +	b	.Lstart_realigned + +/*when src1 offset is not equal to src2 offset...*/ +.Lmisaligned8: +	cmp	limit, #8 +	b.lo	.Ltiny8proc /*limit < 8... */ +	/* +	* Get the align offset length to compare per byte first. +	* After this process, one string's address will be aligned.*/ +	and	tmp1, src1, #7 +	neg	tmp1, tmp1 +	add	tmp1, tmp1, #8 +	and	tmp2, src2, #7 +	neg	tmp2, tmp2 +	add	tmp2, tmp2, #8 +	subs	tmp3, tmp1, tmp2 +	csel	pos, tmp1, tmp2, hi /*Choose the maximum. */ +	/* +	* Here, limit is not less than 8, so directly run .Ltinycmp +	* without checking the limit.*/ +	sub	limit, limit, pos +.Ltinycmp: +	ldrb	data1w, [src1], #1 +	ldrb	data2w, [src2], #1 +	subs	pos, pos, #1 +	ccmp	data1w, #1, #0, ne  /* NZCV = 0b0000.  */ +	ccmp	data1w, data2w, #0, cs  /* NZCV = 0b0000.  */ +	b.eq	.Ltinycmp +	cbnz	pos, 1f /*find the null or unequal...*/ +	cmp	data1w, #1 +	ccmp	data1w, data2w, #0, cs +	b.eq	.Lstart_align /*the last bytes are equal....*/ +1: +	sub	result, data1, data2 +	ret + +.Lstart_align: +	lsr	limit_wd, limit, #3 +	cbz	limit_wd, .Lremain8 +	/*process more leading bytes to make str1 aligned...*/ +	ands	xzr, src1, #7 +	b.eq	.Lrecal_offset +	add	src1, src1, tmp3	/*tmp3 is positive in this branch.*/ +	add	src2, src2, tmp3 +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 + +	sub	limit, limit, tmp3 +	lsr	limit_wd, limit, #3 +	subs	limit_wd, limit_wd, #1 + +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	csinv	endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ +	bics	has_nul, tmp1, tmp2 +	ccmp	endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ +	b.ne	.Lunequal_proc +	/*How far is the current str2 from the alignment boundary...*/ +	and	tmp3, tmp3, #7 +.Lrecal_offset: +	neg	pos, tmp3 +.Lloopcmp_proc: +	/* +	* Divide the eight bytes into two parts. First,backwards the src2 +	* to an alignment boundary,load eight bytes from the SRC2 alignment +	* boundary,then compare with the relative bytes from SRC1. +	* If all 8 bytes are equal,then start the second part's comparison. +	* Otherwise finish the comparison. +	* This special handle can garantee all the accesses are in the +	* thread/task space in avoid to overrange access. 
+	*/
+	ldr	data1, [src1,pos]
+	ldr	data2, [src2,pos]
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	bics	has_nul, tmp1, tmp2 /* Non-zero if NUL terminator.  */
+	eor	diff, data1, data2  /* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, eq
+	cbnz	endloop, .Lunequal_proc
+
+	/* The second part of the comparison.  */
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	subs	limit_wd, limit_wd, #1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2  /* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, ne	/* If limit_wd is 0, finish the comparison.  */
+	bics	has_nul, tmp1, tmp2
+	ccmp	endloop, #0, #0, eq	/* has_nul is zero: no NUL byte.  */
+	b.eq	.Lloopcmp_proc
+
+.Lunequal_proc:
+	orr	syndrome, diff, has_nul
+	cbz	syndrome, .Lremain8
+.Lcal_cmpresult:
+	/*
+	* Reverse the byte order (to big-endian) so that CLZ can find the
+	* most significant set bit of the syndrome.
+	*/
+CPU_LE( rev	syndrome, syndrome )
+CPU_LE( rev	data1, data1 )
+CPU_LE( rev	data2, data2 )
+	/*
+	* For big-endian we cannot use the trick with the syndrome value
+	* as carry-propagation can corrupt the upper bits if the trailing
+	* bytes in the string contain 0x01.
+	* However, if there is no NUL byte in the dword, we can generate
+	* the result directly.  We can't just subtract the bytes as the
+	* MSB might be significant.
+	*/
+CPU_BE( cbnz	has_nul, 1f )
+CPU_BE( cmp	data1, data2 )
+CPU_BE( cset	result, ne )
+CPU_BE( cneg	result, result, lo )
+CPU_BE( ret )
+CPU_BE( 1: )
+	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
+CPU_BE( rev	tmp3, data1 )
+CPU_BE( sub	tmp1, tmp3, zeroones )
+CPU_BE( orr	tmp2, tmp3, #REP8_7f )
+CPU_BE( bic	has_nul, tmp1, tmp2 )
+CPU_BE( rev	has_nul, has_nul )
+CPU_BE( orr	syndrome, diff, has_nul )
+	/*
+	* The MS-non-zero bit of the syndrome marks either the first bit
+	* that is different, or the top bit of the first zero byte.
+	* Shifting left now will bring the critical information into the
+	* top bits.
+	*/
+	clz	pos, syndrome
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/*
+	* But we need to zero-extend (char is unsigned) the value and then
+	* perform a signed 32-bit subtraction.
+	*/
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+
+.Lremain8:
+	/* Limit % 8 == 0 => all bytes significant.  */
+	ands	limit, limit, #7
+	b.eq	.Lret0
+.Ltiny8proc:
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	subs	limit, limit, #1
+
+	ccmp	data1w, #1, #0, ne  /* NZCV = 0b0000.  */
+	ccmp	data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
+	b.eq	.Ltiny8proc
+	sub	result, data1, data2
+	ret
+
+.Lret0:
+	mov	result, #0
+	ret
+ENDPROC(strncmp)
diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S
new file mode 100644
index 00000000000..2ca665711bf
--- /dev/null
+++ b/arch/arm64/lib/strnlen.S
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * determine the length of a fixed-size string
+ *
+ * Parameters:
+ *	x0 - const string pointer
+ *	x1 - maximum string length
+ * Returns:
+ *	x0 - the length of the string, capped at the maximum length x1
+ */
+
+/* Arguments and results.  */
+srcin		.req	x0
+len		.req	x0
+limit		.req	x1
+
+/* Locals and temporaries.  */
+src		.req	x2
+data1		.req	x3
+data2		.req	x4
+data2a		.req	x5
+has_nul1	.req	x6
+has_nul2	.req	x7
+tmp1		.req	x8
+tmp2		.req	x9
+tmp3		.req	x10
+tmp4		.req	x11
+zeroones	.req	x12
+pos		.req	x13
+limit_wd	.req	x14
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ENTRY(strnlen)
+	cbz	limit, .Lhit_limit
+	mov	zeroones, #REP8_01
+	bic	src, srcin, #15
+	ands	tmp1, srcin, #15
+	b.ne	.Lmisaligned
+	/* Calculate the number of full and partial words -1.  */
+	sub	limit_wd, limit, #1 /* Limit != 0, so no underflow.  */
+	lsr	limit_wd, limit_wd, #4  /* Convert to Qwords.  */
+
+	/*
+	* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	* can be done in parallel across the entire word.
+	*/
+	/*
+	* The inner loop deals with two Dwords at a time.  This has a
+	* slightly higher start-up cost, but we should win quite quickly,
+	* especially on cores with a high number of issue slots per
+	* cycle, as we get much better parallelism out of the operations.
+	*/
+.Lloop:
+	ldp	data1, data2, [src], #16
+.Lrealigned:
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	subs	limit_wd, limit_wd, #1
+	orr	tmp1, has_nul1, has_nul2
+	ccmp	tmp1, #0, #0, pl    /* NZCV = 0000  */
+	b.eq	.Lloop
+
+	cbz	tmp1, .Lhit_limit   /* No NUL in final Qword.  */
+
+	/*
+	* We know there's a NUL in the final Qword. The easiest thing
+	* to do now is work out the length of the string and return
+	* MIN (len, limit).
+	*/
+	sub	len, src, srcin
+	cbz	has_nul1, .Lnul_in_data2
+CPU_BE( mov	data2, data1 )	/* Prepare data to re-calculate the syndrome.  */
+
+	sub	len, len, #8
+	mov	has_nul2, has_nul1
+.Lnul_in_data2:
+	/*
+	* For big-endian, carry propagation (if the final byte in the
+	* string is 0x01) means we cannot use has_nul directly.  The
+	* easiest way to get the correct byte is to byte-swap the data
+	* and calculate the syndrome a second time.
+	*/
+CPU_BE( rev	data2, data2 )
+CPU_BE( sub	tmp1, data2, zeroones )
+CPU_BE( orr	tmp2, data2, #REP8_7f )
+CPU_BE( bic	has_nul2, tmp1, tmp2 )
+
+	sub	len, len, #8
+	rev	has_nul2, has_nul2
+	clz	pos, has_nul2
+	add	len, len, pos, lsr #3       /* Bits to bytes.  */
+	cmp	len, limit
+	csel	len, len, limit, ls     /* Return the lower value.  */
+	ret
+
+.Lmisaligned:
+	/*
+	* Deal with a partial first word.
+	* We're doing two things in parallel here:
+	* 1) Calculate the number of words (but avoiding overflow if
+	* limit is near ULONG_MAX) - to do this we need to work out
+	* limit + tmp1 - 1 as a 65-bit value before shifting it;
+	* 2) Load and mask the initial data words - we force the bytes
+	* before the ones we are interested in to 0xff - this ensures
+	* early bytes will not hit any zero detection.
+	*/
+	ldp	data1, data2, [src], #16
+
+	sub	limit_wd, limit, #1
+	and	tmp3, limit_wd, #15
+	lsr	limit_wd, limit_wd, #4
+
+	add	tmp3, tmp3, tmp1
+	add	limit_wd, limit_wd, tmp3, lsr #4
+
+	neg	tmp4, tmp1
+	lsl	tmp4, tmp4, #3  /* Bytes beyond alignment -> bits.  */
+
+	mov	tmp2, #~0
+	/* Big-endian.  Early bytes are at MSB.  */
+CPU_BE( lsl	tmp2, tmp2, tmp4 )	/* Shift (tmp1 & 63).  */
+	/* Little-endian.  Early bytes are at LSB.  */
+CPU_LE( lsr	tmp2, tmp2, tmp4 )	/* Shift (tmp1 & 63).  */
+
+	cmp	tmp1, #8
+
+	orr	data1, data1, tmp2
+	orr	data2a, data2, tmp2
+
+	csinv	data1, data1, xzr, le
+	csel	data2, data2, data2a, le
+	b	.Lrealigned
+
+.Lhit_limit:
+	mov	len, limit
+	ret
+ENDPROC(strnlen)
diff --git a/arch/arm64/lib/strrchr.S b/arch/arm64/lib/strrchr.S
new file mode 100644
index 00000000000..61eabd9a289
--- /dev/null
+++ b/arch/arm64/lib/strrchr.S
@@ -0,0 +1,43 @@
+/*
+ * Based on arch/arm/lib/strrchr.S
+ *
+ * Copyright (C) 1995-2000 Russell King
+ * Copyright (C) 2013 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Find the last occurrence of a character in a string.
+ *
+ * Parameters:
+ *	x0 - str
+ *	x1 - c
+ * Returns:
+ *	x0 - address of last occurrence of 'c' or 0
+ */
+ENTRY(strrchr)
+	mov	x3, #0
+	and	w1, w1, #0xff
+1:	ldrb	w2, [x0], #1
+	cbz	w2, 2f
+	cmp	w2, w1
+	b.ne	1b
+	sub	x3, x0, #1
+	b	1b
+2:	mov	x0, x3
+	ret
+ENDPROC(strrchr)
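The comment blocks in strncmp.S and strnlen.S above describe the word-at-a-time NUL detection as (X - 1) & (~X) & 0x80 per byte, i.e. (X - 1) & ~(X | 0x7f). Below is a minimal C sketch of that trick. It is illustrative only: the helper names has_zero_byte() and first_zero_byte() are not part of these sources, it assumes a little-endian load, and it uses the GCC/Clang __builtin_ctzll builtin rather than the CLZ-on-reversed-syndrome sequence used by the assembly.

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_80 0x8080808080808080ULL

/* Non-zero iff 'x' contains at least one zero byte.  Note that
 * ~x & REP8_80 == ~(x | 0x7f7f...7f), which is the form the assembly
 * builds with "orr tmp2, data, #REP8_7f; bics has_nul, tmp1, tmp2". */
static inline uint64_t has_zero_byte(uint64_t x)
{
	return (x - REP8_01) & ~x & REP8_80;
}

/* Byte index (0..7) of the first zero byte, counting from the lowest
 * address, for a word loaded little-endian.  Only meaningful when
 * has_zero_byte(x) is non-zero. */
static inline unsigned int first_zero_byte(uint64_t x)
{
	return (unsigned int)(__builtin_ctzll(has_zero_byte(x)) >> 3);
}

Bytes above the first zero byte can be flagged spuriously when the subtraction borrows through 0x01 bytes; that is the big-endian caveat the CPU_BE( ... ) paths above work around by recomputing the syndrome on byte-reversed data.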

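As a cross-check of the header comments above, here is a byte-by-byte C sketch of the behaviour the optimized strncmp and strnlen routines are expected to provide. It is illustrative only: the _ref function names are hypothetical and the kernel builds the assembly versions, not these.

#include <stddef.h>

/* Returns <0, 0 or >0 according to the first differing byte within the
 * first 'limit' bytes; bytes are compared as unsigned values and the
 * comparison stops at the first NUL in either string. */
static int strncmp_ref(const char *s1, const char *s2, size_t limit)
{
	while (limit--) {
		unsigned char c1 = *s1++;
		unsigned char c2 = *s2++;

		if (c1 != c2)
			return c1 - c2;
		if (c1 == '\0')
			return 0;
	}
	return 0;
}

/* Returns the string length, capped at 'limit' when no NUL terminator is
 * found within the first 'limit' bytes. */
static size_t strnlen_ref(const char *s, size_t limit)
{
	size_t len;

	for (len = 0; len < limit && s[len] != '\0'; len++)
		;
	return len;
}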