Diffstat (limited to 'arch/arm64/lib')
 -rw-r--r--  arch/arm64/lib/Makefile            |   9
 -rw-r--r--  arch/arm64/lib/bitops.S            |   3
 -rw-r--r--  arch/arm64/lib/memcmp.S            | 258
 -rw-r--r--  arch/arm64/lib/memcpy.S            | 192
 -rw-r--r--  arch/arm64/lib/memmove.S           | 190
 -rw-r--r--  arch/arm64/lib/memset.S            | 207
 -rw-r--r--  arch/arm64/lib/strcmp.S            | 234
 -rw-r--r--  arch/arm64/lib/strlen.S            | 126
 -rw-r--r--  arch/arm64/lib/strncmp.S           | 310
 -rw-r--r--  arch/arm64/lib/strncpy_from_user.S |  50
 -rw-r--r--  arch/arm64/lib/strnlen.S           | 171
 -rw-r--r--  arch/arm64/lib/strnlen_user.S      |  47
12 files changed, 1625 insertions, 172 deletions
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 59acc0ef046..d98d3e39879 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -1,6 +1,5 @@ -lib-y		:= bitops.o delay.o					\ -		   strncpy_from_user.o strnlen_user.o clear_user.o	\ -		   copy_from_user.o copy_to_user.o copy_in_user.o	\ -		   copy_page.o clear_page.o				\ -		   memchr.o memcpy.o memmove.o memset.o			\ +lib-y		:= bitops.o clear_user.o delay.o copy_from_user.o	\ +		   copy_to_user.o copy_in_user.o copy_page.o		\ +		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\ +		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\  		   strchr.o strrchr.o diff --git a/arch/arm64/lib/bitops.S b/arch/arm64/lib/bitops.S index e5db797790d..7dac371cc9a 100644 --- a/arch/arm64/lib/bitops.S +++ b/arch/arm64/lib/bitops.S @@ -46,11 +46,12 @@ ENTRY(	\name	)  	mov	x2, #1  	add	x1, x1, x0, lsr #3	// Get word offset  	lsl	x4, x2, x3		// Create mask -1:	ldaxr	x2, [x1] +1:	ldxr	x2, [x1]  	lsr	x0, x2, x3		// Save old value of bit  	\instr	x2, x2, x4		// toggle bit  	stlxr	w5, x2, [x1]  	cbnz	w5, 1b +	dmb	ish  	and	x0, x0, #1  3:	ret  ENDPROC(\name	) diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S new file mode 100644 index 00000000000..6ea0776ba6d --- /dev/null +++ b/arch/arm64/lib/memcmp.S @@ -0,0 +1,258 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* +* compare memory areas(when two memory areas' offset are different, +* alignment handled by the hardware) +* +* Parameters: +*  x0 - const memory area 1 pointer +*  x1 - const memory area 2 pointer +*  x2 - the maximal compare byte length +* Returns: +*  x0 - a compare result, maybe less than, equal to, or greater than ZERO +*/ + +/* Parameters and result.  */ +src1		.req	x0 +src2		.req	x1 +limit		.req	x2 +result		.req	x0 + +/* Internal variables.  */ +data1		.req	x3 +data1w		.req	w3 +data2		.req	x4 +data2w		.req	w4 +has_nul		.req	x5 +diff		.req	x6 +endloop		.req	x7 +tmp1		.req	x8 +tmp2		.req	x9 +tmp3		.req	x10 +pos		.req	x11 +limit_wd	.req	x12 +mask		.req	x13 + +ENTRY(memcmp) +	cbz	limit, .Lret0 +	eor	tmp1, src1, src2 +	tst	tmp1, #7 +	b.ne	.Lmisaligned8 +	ands	tmp1, src1, #7 +	b.ne	.Lmutual_align +	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */ +	lsr	limit_wd, limit_wd, #3 /* Convert to Dwords.  */ +	/* +	* The input source addresses are at alignment boundary. +	* Directly compare eight bytes each time. +	*/ +.Lloop_aligned: +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 +.Lstart_realigned: +	subs	limit_wd, limit_wd, #1 +	eor	diff, data1, data2	/* Non-zero if differences found.  
*/ +	csinv	endloop, diff, xzr, cs	/* Last Dword or differences.  */ +	cbz	endloop, .Lloop_aligned + +	/* Not reached the limit, must have found a diff.  */ +	tbz	limit_wd, #63, .Lnot_limit + +	/* Limit % 8 == 0 => the diff is in the last 8 bytes. */ +	ands	limit, limit, #7 +	b.eq	.Lnot_limit +	/* +	* The remained bytes less than 8. It is needed to extract valid data +	* from last eight bytes of the intended memory range. +	*/ +	lsl	limit, limit, #3	/* bytes-> bits.  */ +	mov	mask, #~0 +CPU_BE( lsr	mask, mask, limit ) +CPU_LE( lsl	mask, mask, limit ) +	bic	data1, data1, mask +	bic	data2, data2, mask + +	orr	diff, diff, mask +	b	.Lnot_limit + +.Lmutual_align: +	/* +	* Sources are mutually aligned, but are not currently at an +	* alignment boundary. Round down the addresses and then mask off +	* the bytes that precede the start point. +	*/ +	bic	src1, src1, #7 +	bic	src2, src2, #7 +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 +	/* +	* We can not add limit with alignment offset(tmp1) here. Since the +	* addition probably make the limit overflown. +	*/ +	sub	limit_wd, limit, #1/*limit != 0, so no underflow.*/ +	and	tmp3, limit_wd, #7 +	lsr	limit_wd, limit_wd, #3 +	add	tmp3, tmp3, tmp1 +	add	limit_wd, limit_wd, tmp3, lsr #3 +	add	limit, limit, tmp1/* Adjust the limit for the extra.  */ + +	lsl	tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/ +	neg	tmp1, tmp1/* Bits to alignment -64.  */ +	mov	tmp2, #~0 +	/*mask off the non-intended bytes before the start address.*/ +CPU_BE( lsl	tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/ +	/* Little-endian.  Early bytes are at LSB.  */ +CPU_LE( lsr	tmp2, tmp2, tmp1 ) + +	orr	data1, data1, tmp2 +	orr	data2, data2, tmp2 +	b	.Lstart_realigned + +	/*src1 and src2 have different alignment offset.*/ +.Lmisaligned8: +	cmp	limit, #8 +	b.lo	.Ltiny8proc /*limit < 8: compare byte by byte*/ + +	and	tmp1, src1, #7 +	neg	tmp1, tmp1 +	add	tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/ +	and	tmp2, src2, #7 +	neg	tmp2, tmp2 +	add	tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/ +	subs	tmp3, tmp1, tmp2 +	csel	pos, tmp1, tmp2, hi /*Choose the maximum.*/ + +	sub	limit, limit, pos +	/*compare the proceeding bytes in the first 8 byte segment.*/ +.Ltinycmp: +	ldrb	data1w, [src1], #1 +	ldrb	data2w, [src2], #1 +	subs	pos, pos, #1 +	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000.  */ +	b.eq	.Ltinycmp +	cbnz	pos, 1f /*diff occurred before the last byte.*/ +	cmp	data1w, data2w +	b.eq	.Lstart_align +1: +	sub	result, data1, data2 +	ret + +.Lstart_align: +	lsr	limit_wd, limit, #3 +	cbz	limit_wd, .Lremain8 + +	ands	xzr, src1, #7 +	b.eq	.Lrecal_offset +	/*process more leading bytes to make src1 aligned...*/ +	add	src1, src1, tmp3 /*backwards src1 to alignment boundary*/ +	add	src2, src2, tmp3 +	sub	limit, limit, tmp3 +	lsr	limit_wd, limit, #3 +	cbz	limit_wd, .Lremain8 +	/*load 8 bytes from aligned SRC1..*/ +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 + +	subs	limit_wd, limit_wd, #1 +	eor	diff, data1, data2  /*Non-zero if differences found.*/ +	csinv	endloop, diff, xzr, ne +	cbnz	endloop, .Lunequal_proc +	/*How far is the current SRC2 from the alignment boundary...*/ +	and	tmp3, tmp3, #7 + +.Lrecal_offset:/*src1 is aligned now..*/ +	neg	pos, tmp3 +.Lloopcmp_proc: +	/* +	* Divide the eight bytes into two parts. First,backwards the src2 +	* to an alignment boundary,load eight bytes and compare from +	* the SRC2 alignment boundary. If all 8 bytes are equal,then start +	* the second part's comparison. Otherwise finish the comparison. 
+	* This special handle can garantee all the accesses are in the +	* thread/task space in avoid to overrange access. +	*/ +	ldr	data1, [src1,pos] +	ldr	data2, [src2,pos] +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	cbnz	diff, .Lnot_limit + +	/*The second part process*/ +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	subs	limit_wd, limit_wd, #1 +	csinv	endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ +	cbz	endloop, .Lloopcmp_proc +.Lunequal_proc: +	cbz	diff, .Lremain8 + +/*There is differnence occured in the latest comparison.*/ +.Lnot_limit: +/* +* For little endian,reverse the low significant equal bits into MSB,then +* following CLZ can find how many equal bits exist. +*/ +CPU_LE( rev	diff, diff ) +CPU_LE( rev	data1, data1 ) +CPU_LE( rev	data2, data2 ) + +	/* +	* The MS-non-zero bit of DIFF marks either the first bit +	* that is different, or the end of the significant data. +	* Shifting left now will bring the critical information into the +	* top bits. +	*/ +	clz	pos, diff +	lsl	data1, data1, pos +	lsl	data2, data2, pos +	/* +	* We need to zero-extend (char is unsigned) the value and then +	* perform a signed subtraction. +	*/ +	lsr	data1, data1, #56 +	sub	result, data1, data2, lsr #56 +	ret + +.Lremain8: +	/* Limit % 8 == 0 =>. all data are equal.*/ +	ands	limit, limit, #7 +	b.eq	.Lret0 + +.Ltiny8proc: +	ldrb	data1w, [src1], #1 +	ldrb	data2w, [src2], #1 +	subs	limit, limit, #1 + +	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000. */ +	b.eq	.Ltiny8proc +	sub	result, data1, data2 +	ret +.Lret0: +	mov	result, #0 +	ret +ENDPROC(memcmp) diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 27b5003609b..8a9a96d3dda 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -1,5 +1,13 @@  /*   * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -16,6 +24,7 @@  #include <linux/linkage.h>  #include <asm/assembler.h> +#include <asm/cache.h>  /*   * Copy a buffer from src to dest (alignment handled by the hardware) @@ -27,27 +36,166 @@   * Returns:   *	x0 - dest   */ +dstin	.req	x0 +src	.req	x1 +count	.req	x2 +tmp1	.req	x3 +tmp1w	.req	w3 +tmp2	.req	x4 +tmp2w	.req	w4 +tmp3	.req	x5 +tmp3w	.req	w5 +dst	.req	x6 + +A_l	.req	x7 +A_h	.req	x8 +B_l	.req	x9 +B_h	.req	x10 +C_l	.req	x11 +C_h	.req	x12 +D_l	.req	x13 +D_h	.req	x14 +  ENTRY(memcpy) -	mov	x4, x0 -	subs	x2, x2, #8 -	b.mi	2f -1:	ldr	x3, [x1], #8 -	subs	x2, x2, #8 -	str	x3, [x4], #8 -	b.pl	1b -2:	adds	x2, x2, #4 -	b.mi	3f -	ldr	w3, [x1], #4 -	sub	x2, x2, #4 -	str	w3, [x4], #4 -3:	adds	x2, x2, #2 -	b.mi	4f -	ldrh	w3, [x1], #2 -	sub	x2, x2, #2 -	strh	w3, [x4], #2 -4:	adds	x2, x2, #1 -	b.mi	5f -	ldrb	w3, [x1] -	strb	w3, [x4] -5:	ret +	mov	dst, dstin +	cmp	count, #16 +	/*When memory length is less than 16, the accessed are not aligned.*/ +	b.lo	.Ltiny15 + +	neg	tmp2, src +	ands	tmp2, tmp2, #15/* Bytes to reach alignment. 
*/ +	b.eq	.LSrcAligned +	sub	count, count, tmp2 +	/* +	* Copy the leading memory data from src to dst in an increasing +	* address order.By this way,the risk of overwritting the source +	* memory data is eliminated when the distance between src and +	* dst is less than 16. The memory accesses here are alignment. +	*/ +	tbz	tmp2, #0, 1f +	ldrb	tmp1w, [src], #1 +	strb	tmp1w, [dst], #1 +1: +	tbz	tmp2, #1, 2f +	ldrh	tmp1w, [src], #2 +	strh	tmp1w, [dst], #2 +2: +	tbz	tmp2, #2, 3f +	ldr	tmp1w, [src], #4 +	str	tmp1w, [dst], #4 +3: +	tbz	tmp2, #3, .LSrcAligned +	ldr	tmp1, [src],#8 +	str	tmp1, [dst],#8 + +.LSrcAligned: +	cmp	count, #64 +	b.ge	.Lcpy_over64 +	/* +	* Deal with small copies quickly by dropping straight into the +	* exit block. +	*/ +.Ltail63: +	/* +	* Copy up to 48 bytes of data. At this point we only need the +	* bottom 6 bits of count to be accurate. +	*/ +	ands	tmp1, count, #0x30 +	b.eq	.Ltiny15 +	cmp	tmp1w, #0x20 +	b.eq	1f +	b.lt	2f +	ldp	A_l, A_h, [src], #16 +	stp	A_l, A_h, [dst], #16 +1: +	ldp	A_l, A_h, [src], #16 +	stp	A_l, A_h, [dst], #16 +2: +	ldp	A_l, A_h, [src], #16 +	stp	A_l, A_h, [dst], #16 +.Ltiny15: +	/* +	* Prefer to break one ldp/stp into several load/store to access +	* memory in an increasing address order,rather than to load/store 16 +	* bytes from (src-16) to (dst-16) and to backward the src to aligned +	* address,which way is used in original cortex memcpy. If keeping +	* the original memcpy process here, memmove need to satisfy the +	* precondition that src address is at least 16 bytes bigger than dst +	* address,otherwise some source data will be overwritten when memove +	* call memcpy directly. To make memmove simpler and decouple the +	* memcpy's dependency on memmove, withdrew the original process. +	*/ +	tbz	count, #3, 1f +	ldr	tmp1, [src], #8 +	str	tmp1, [dst], #8 +1: +	tbz	count, #2, 2f +	ldr	tmp1w, [src], #4 +	str	tmp1w, [dst], #4 +2: +	tbz	count, #1, 3f +	ldrh	tmp1w, [src], #2 +	strh	tmp1w, [dst], #2 +3: +	tbz	count, #0, .Lexitfunc +	ldrb	tmp1w, [src] +	strb	tmp1w, [dst] + +.Lexitfunc: +	ret + +.Lcpy_over64: +	subs	count, count, #128 +	b.ge	.Lcpy_body_large +	/* +	* Less than 128 bytes to copy, so handle 64 here and then jump +	* to the tail. +	*/ +	ldp	A_l, A_h, [src],#16 +	stp	A_l, A_h, [dst],#16 +	ldp	B_l, B_h, [src],#16 +	ldp	C_l, C_h, [src],#16 +	stp	B_l, B_h, [dst],#16 +	stp	C_l, C_h, [dst],#16 +	ldp	D_l, D_h, [src],#16 +	stp	D_l, D_h, [dst],#16 + +	tst	count, #0x3f +	b.ne	.Ltail63 +	ret + +	/* +	* Critical loop.  Start at a new cache line boundary.  Assuming +	* 64 bytes per line this ensures the entire loop is in one line. +	*/ +	.p2align	L1_CACHE_SHIFT +.Lcpy_body_large: +	/* pre-get 64 bytes data. */ +	ldp	A_l, A_h, [src],#16 +	ldp	B_l, B_h, [src],#16 +	ldp	C_l, C_h, [src],#16 +	ldp	D_l, D_h, [src],#16 +1: +	/* +	* interlace the load of next 64 bytes data block with store of the last +	* loaded 64 bytes data. +	*/ +	stp	A_l, A_h, [dst],#16 +	ldp	A_l, A_h, [src],#16 +	stp	B_l, B_h, [dst],#16 +	ldp	B_l, B_h, [src],#16 +	stp	C_l, C_h, [dst],#16 +	ldp	C_l, C_h, [src],#16 +	stp	D_l, D_h, [dst],#16 +	ldp	D_l, D_h, [src],#16 +	subs	count, count, #64 +	b.ge	1b +	stp	A_l, A_h, [dst],#16 +	stp	B_l, B_h, [dst],#16 +	stp	C_l, C_h, [dst],#16 +	stp	D_l, D_h, [dst],#16 + +	tst	count, #0x3f +	b.ne	.Ltail63 +	ret  ENDPROC(memcpy) diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S index b79fdfa42d3..57b19ea2dad 100644 --- a/arch/arm64/lib/memmove.S +++ b/arch/arm64/lib/memmove.S @@ -1,5 +1,13 @@  /*   * Copyright (C) 2013 ARM Ltd. 
+ * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -16,6 +24,7 @@  #include <linux/linkage.h>  #include <asm/assembler.h> +#include <asm/cache.h>  /*   * Move a buffer from src to test (alignment handled by the hardware). @@ -28,30 +37,161 @@   * Returns:   *	x0 - dest   */ +dstin	.req	x0 +src	.req	x1 +count	.req	x2 +tmp1	.req	x3 +tmp1w	.req	w3 +tmp2	.req	x4 +tmp2w	.req	w4 +tmp3	.req	x5 +tmp3w	.req	w5 +dst	.req	x6 + +A_l	.req	x7 +A_h	.req	x8 +B_l	.req	x9 +B_h	.req	x10 +C_l	.req	x11 +C_h	.req	x12 +D_l	.req	x13 +D_h	.req	x14 +  ENTRY(memmove) -	cmp	x0, x1 -	b.ls	memcpy -	add	x4, x0, x2 -	add	x1, x1, x2 -	subs	x2, x2, #8 -	b.mi	2f -1:	ldr	x3, [x1, #-8]! -	subs	x2, x2, #8 -	str	x3, [x4, #-8]! -	b.pl	1b -2:	adds	x2, x2, #4 -	b.mi	3f -	ldr	w3, [x1, #-4]! -	sub	x2, x2, #4 -	str	w3, [x4, #-4]! -3:	adds	x2, x2, #2 -	b.mi	4f -	ldrh	w3, [x1, #-2]! -	sub	x2, x2, #2 -	strh	w3, [x4, #-2]! -4:	adds	x2, x2, #1 -	b.mi	5f -	ldrb	w3, [x1, #-1] -	strb	w3, [x4, #-1] -5:	ret +	cmp	dstin, src +	b.lo	memcpy +	add	tmp1, src, count +	cmp	dstin, tmp1 +	b.hs	memcpy		/* No overlap.  */ + +	add	dst, dstin, count +	add	src, src, count +	cmp	count, #16 +	b.lo	.Ltail15  /*probably non-alignment accesses.*/ + +	ands	tmp2, src, #15     /* Bytes to reach alignment.  */ +	b.eq	.LSrcAligned +	sub	count, count, tmp2 +	/* +	* process the aligned offset length to make the src aligned firstly. +	* those extra instructions' cost is acceptable. It also make the +	* coming accesses are based on aligned address. +	*/ +	tbz	tmp2, #0, 1f +	ldrb	tmp1w, [src, #-1]! +	strb	tmp1w, [dst, #-1]! +1: +	tbz	tmp2, #1, 2f +	ldrh	tmp1w, [src, #-2]! +	strh	tmp1w, [dst, #-2]! +2: +	tbz	tmp2, #2, 3f +	ldr	tmp1w, [src, #-4]! +	str	tmp1w, [dst, #-4]! +3: +	tbz	tmp2, #3, .LSrcAligned +	ldr	tmp1, [src, #-8]! +	str	tmp1, [dst, #-8]! + +.LSrcAligned: +	cmp	count, #64 +	b.ge	.Lcpy_over64 + +	/* +	* Deal with small copies quickly by dropping straight into the +	* exit block. +	*/ +.Ltail63: +	/* +	* Copy up to 48 bytes of data. At this point we only need the +	* bottom 6 bits of count to be accurate. +	*/ +	ands	tmp1, count, #0x30 +	b.eq	.Ltail15 +	cmp	tmp1w, #0x20 +	b.eq	1f +	b.lt	2f +	ldp	A_l, A_h, [src, #-16]! +	stp	A_l, A_h, [dst, #-16]! +1: +	ldp	A_l, A_h, [src, #-16]! +	stp	A_l, A_h, [dst, #-16]! +2: +	ldp	A_l, A_h, [src, #-16]! +	stp	A_l, A_h, [dst, #-16]! + +.Ltail15: +	tbz	count, #3, 1f +	ldr	tmp1, [src, #-8]! +	str	tmp1, [dst, #-8]! +1: +	tbz	count, #2, 2f +	ldr	tmp1w, [src, #-4]! +	str	tmp1w, [dst, #-4]! +2: +	tbz	count, #1, 3f +	ldrh	tmp1w, [src, #-2]! +	strh	tmp1w, [dst, #-2]! +3: +	tbz	count, #0, .Lexitfunc +	ldrb	tmp1w, [src, #-1] +	strb	tmp1w, [dst, #-1] + +.Lexitfunc: +	ret + +.Lcpy_over64: +	subs	count, count, #128 +	b.ge	.Lcpy_body_large +	/* +	* Less than 128 bytes to copy, so handle 64 bytes here and then jump +	* to the tail. +	*/ +	ldp	A_l, A_h, [src, #-16] +	stp	A_l, A_h, [dst, #-16] +	ldp	B_l, B_h, [src, #-32] +	ldp	C_l, C_h, [src, #-48] +	stp	B_l, B_h, [dst, #-32] +	stp	C_l, C_h, [dst, #-48] +	ldp	D_l, D_h, [src, #-64]! +	stp	D_l, D_h, [dst, #-64]! + +	tst	count, #0x3f +	b.ne	.Ltail63 +	ret + +	/* +	* Critical loop. 
Start at a new cache line boundary. Assuming +	* 64 bytes per line this ensures the entire loop is in one line. +	*/ +	.p2align	L1_CACHE_SHIFT +.Lcpy_body_large: +	/* pre-load 64 bytes data. */ +	ldp	A_l, A_h, [src, #-16] +	ldp	B_l, B_h, [src, #-32] +	ldp	C_l, C_h, [src, #-48] +	ldp	D_l, D_h, [src, #-64]! +1: +	/* +	* interlace the load of next 64 bytes data block with store of the last +	* loaded 64 bytes data. +	*/ +	stp	A_l, A_h, [dst, #-16] +	ldp	A_l, A_h, [src, #-16] +	stp	B_l, B_h, [dst, #-32] +	ldp	B_l, B_h, [src, #-32] +	stp	C_l, C_h, [dst, #-48] +	ldp	C_l, C_h, [src, #-48] +	stp	D_l, D_h, [dst, #-64]! +	ldp	D_l, D_h, [src, #-64]! +	subs	count, count, #64 +	b.ge	1b +	stp	A_l, A_h, [dst, #-16] +	stp	B_l, B_h, [dst, #-32] +	stp	C_l, C_h, [dst, #-48] +	stp	D_l, D_h, [dst, #-64]! + +	tst	count, #0x3f +	b.ne	.Ltail63 +	ret  ENDPROC(memmove) diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S index 87e4a68fbbb..7c72dfd36b6 100644 --- a/arch/arm64/lib/memset.S +++ b/arch/arm64/lib/memset.S @@ -1,5 +1,13 @@  /*   * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -16,6 +24,7 @@  #include <linux/linkage.h>  #include <asm/assembler.h> +#include <asm/cache.h>  /*   * Fill in the buffer with character c (alignment handled by the hardware) @@ -27,27 +36,181 @@   * Returns:   *	x0 - buf   */ + +dstin		.req	x0 +val		.req	w1 +count		.req	x2 +tmp1		.req	x3 +tmp1w		.req	w3 +tmp2		.req	x4 +tmp2w		.req	w4 +zva_len_x	.req	x5 +zva_len		.req	w5 +zva_bits_x	.req	x6 + +A_l		.req	x7 +A_lw		.req	w7 +dst		.req	x8 +tmp3w		.req	w9 +tmp3		.req	x9 +  ENTRY(memset) -	mov	x4, x0 -	and	w1, w1, #0xff -	orr	w1, w1, w1, lsl #8 -	orr	w1, w1, w1, lsl #16 -	orr	x1, x1, x1, lsl #32 -	subs	x2, x2, #8 -	b.mi	2f -1:	str	x1, [x4], #8 -	subs	x2, x2, #8 -	b.pl	1b -2:	adds	x2, x2, #4 -	b.mi	3f -	sub	x2, x2, #4 -	str	w1, [x4], #4 -3:	adds	x2, x2, #2 -	b.mi	4f -	sub	x2, x2, #2 -	strh	w1, [x4], #2 -4:	adds	x2, x2, #1 -	b.mi	5f -	strb	w1, [x4] -5:	ret +	mov	dst, dstin	/* Preserve return value.  */ +	and	A_lw, val, #255 +	orr	A_lw, A_lw, A_lw, lsl #8 +	orr	A_lw, A_lw, A_lw, lsl #16 +	orr	A_l, A_l, A_l, lsl #32 + +	cmp	count, #15 +	b.hi	.Lover16_proc +	/*All store maybe are non-aligned..*/ +	tbz	count, #3, 1f +	str	A_l, [dst], #8 +1: +	tbz	count, #2, 2f +	str	A_lw, [dst], #4 +2: +	tbz	count, #1, 3f +	strh	A_lw, [dst], #2 +3: +	tbz	count, #0, 4f +	strb	A_lw, [dst] +4: +	ret + +.Lover16_proc: +	/*Whether  the start address is aligned with 16.*/ +	neg	tmp2, dst +	ands	tmp2, tmp2, #15 +	b.eq	.Laligned +/* +* The count is not less than 16, we can use stp to store the start 16 bytes, +* then adjust the dst aligned with 16.This process will make the current +* memory address at alignment boundary. 
+*/ +	stp	A_l, A_l, [dst] /*non-aligned store..*/ +	/*make the dst aligned..*/ +	sub	count, count, tmp2 +	add	dst, dst, tmp2 + +.Laligned: +	cbz	A_l, .Lzero_mem + +.Ltail_maybe_long: +	cmp	count, #64 +	b.ge	.Lnot_short +.Ltail63: +	ands	tmp1, count, #0x30 +	b.eq	3f +	cmp	tmp1w, #0x20 +	b.eq	1f +	b.lt	2f +	stp	A_l, A_l, [dst], #16 +1: +	stp	A_l, A_l, [dst], #16 +2: +	stp	A_l, A_l, [dst], #16 +/* +* The last store length is less than 16,use stp to write last 16 bytes. +* It will lead some bytes written twice and the access is non-aligned. +*/ +3: +	ands	count, count, #15 +	cbz	count, 4f +	add	dst, dst, count +	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */ +4: +	ret + +	/* +	* Critical loop. Start at a new cache line boundary. Assuming +	* 64 bytes per line, this ensures the entire loop is in one line. +	*/ +	.p2align	L1_CACHE_SHIFT +.Lnot_short: +	sub	dst, dst, #16/* Pre-bias.  */ +	sub	count, count, #64 +1: +	stp	A_l, A_l, [dst, #16] +	stp	A_l, A_l, [dst, #32] +	stp	A_l, A_l, [dst, #48] +	stp	A_l, A_l, [dst, #64]! +	subs	count, count, #64 +	b.ge	1b +	tst	count, #0x3f +	add	dst, dst, #16 +	b.ne	.Ltail63 +.Lexitfunc: +	ret + +	/* +	* For zeroing memory, check to see if we can use the ZVA feature to +	* zero entire 'cache' lines. +	*/ +.Lzero_mem: +	cmp	count, #63 +	b.le	.Ltail63 +	/* +	* For zeroing small amounts of memory, it's not worth setting up +	* the line-clear code. +	*/ +	cmp	count, #128 +	b.lt	.Lnot_short /*count is at least  128 bytes*/ + +	mrs	tmp1, dczid_el0 +	tbnz	tmp1, #4, .Lnot_short +	mov	tmp3w, #4 +	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */ +	lsl	zva_len, tmp3w, zva_len + +	ands	tmp3w, zva_len, #63 +	/* +	* ensure the zva_len is not less than 64. +	* It is not meaningful to use ZVA if the block size is less than 64. +	*/ +	b.ne	.Lnot_short +.Lzero_by_line: +	/* +	* Compute how far we need to go to become suitably aligned. We're +	* already at quad-word alignment. +	*/ +	cmp	count, zva_len_x +	b.lt	.Lnot_short		/* Not enough to reach alignment.  */ +	sub	zva_bits_x, zva_len_x, #1 +	neg	tmp2, dst +	ands	tmp2, tmp2, zva_bits_x +	b.eq	2f			/* Already aligned.  */ +	/* Not aligned, check that there's enough to copy after alignment.*/ +	sub	tmp1, count, tmp2 +	/* +	* grantee the remain length to be ZVA is bigger than 64, +	* avoid to make the 2f's process over mem range.*/ +	cmp	tmp1, #64 +	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */ +	b.lt	.Lnot_short +	/* +	* We know that there's at least 64 bytes to zero and that it's safe +	* to overrun by 64 bytes. +	*/ +	mov	count, tmp1 +1: +	stp	A_l, A_l, [dst] +	stp	A_l, A_l, [dst, #16] +	stp	A_l, A_l, [dst, #32] +	subs	tmp2, tmp2, #64 +	stp	A_l, A_l, [dst, #48] +	add	dst, dst, #64 +	b.ge	1b +	/* We've overrun a bit, so adjust dst downwards.*/ +	add	dst, dst, tmp2 +2: +	sub	count, count, zva_len_x +3: +	dc	zva, dst +	add	dst, dst, zva_len_x +	subs	count, count, zva_len_x +	b.ge	3b +	ands	count, count, zva_bits_x +	b.ne	.Ltail_maybe_long +	ret  ENDPROC(memset) diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S new file mode 100644 index 00000000000..42f828b06c5 --- /dev/null +++ b/arch/arm64/lib/strcmp.S @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. 
The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * compare two strings + * + * Parameters: + *	x0 - const string 1 pointer + *    x1 - const string 2 pointer + * Returns: + * x0 - an integer less than, equal to, or greater than zero + * if  s1  is  found, respectively, to be less than, to match, + * or be greater than s2. + */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +/* Parameters and result.  */ +src1		.req	x0 +src2		.req	x1 +result		.req	x0 + +/* Internal variables.  */ +data1		.req	x2 +data1w		.req	w2 +data2		.req	x3 +data2w		.req	w3 +has_nul		.req	x4 +diff		.req	x5 +syndrome	.req	x6 +tmp1		.req	x7 +tmp2		.req	x8 +tmp3		.req	x9 +zeroones	.req	x10 +pos		.req	x11 + +ENTRY(strcmp) +	eor	tmp1, src1, src2 +	mov	zeroones, #REP8_01 +	tst	tmp1, #7 +	b.ne	.Lmisaligned8 +	ands	tmp1, src1, #7 +	b.ne	.Lmutual_align + +	/* +	* NUL detection works on the principle that (X - 1) & (~X) & 0x80 +	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and +	* can be done in parallel across the entire word. +	*/ +.Lloop_aligned: +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 +.Lstart_realigned: +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	eor	diff, data1, data2	/* Non-zero if differences found.  */ +	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */ +	orr	syndrome, diff, has_nul +	cbz	syndrome, .Lloop_aligned +	b	.Lcal_cmpresult + +.Lmutual_align: +	/* +	* Sources are mutually aligned, but are not currently at an +	* alignment boundary.  Round down the addresses and then mask off +	* the bytes that preceed the start point. +	*/ +	bic	src1, src1, #7 +	bic	src2, src2, #7 +	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */ +	ldr	data1, [src1], #8 +	neg	tmp1, tmp1		/* Bits to alignment -64.  */ +	ldr	data2, [src2], #8 +	mov	tmp2, #~0 +	/* Big-endian.  Early bytes are at MSB.  */ +CPU_BE( lsl	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */ +	/* Little-endian.  Early bytes are at LSB.  */ +CPU_LE( lsr	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */ + +	orr	data1, data1, tmp2 +	orr	data2, data2, tmp2 +	b	.Lstart_realigned + +.Lmisaligned8: +	/* +	* Get the align offset length to compare per byte first. +	* After this process, one string's address will be aligned. +	*/ +	and	tmp1, src1, #7 +	neg	tmp1, tmp1 +	add	tmp1, tmp1, #8 +	and	tmp2, src2, #7 +	neg	tmp2, tmp2 +	add	tmp2, tmp2, #8 +	subs	tmp3, tmp1, tmp2 +	csel	pos, tmp1, tmp2, hi /*Choose the maximum. */ +.Ltinycmp: +	ldrb	data1w, [src1], #1 +	ldrb	data2w, [src2], #1 +	subs	pos, pos, #1 +	ccmp	data1w, #1, #0, ne  /* NZCV = 0b0000.  */ +	ccmp	data1w, data2w, #0, cs  /* NZCV = 0b0000.  
*/ +	b.eq	.Ltinycmp +	cbnz	pos, 1f /*find the null or unequal...*/ +	cmp	data1w, #1 +	ccmp	data1w, data2w, #0, cs +	b.eq	.Lstart_align /*the last bytes are equal....*/ +1: +	sub	result, data1, data2 +	ret + +.Lstart_align: +	ands	xzr, src1, #7 +	b.eq	.Lrecal_offset +	/*process more leading bytes to make str1 aligned...*/ +	add	src1, src1, tmp3 +	add	src2, src2, tmp3 +	/*load 8 bytes from aligned str1 and non-aligned str2..*/ +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 + +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	bic	has_nul, tmp1, tmp2 +	eor	diff, data1, data2 /* Non-zero if differences found.  */ +	orr	syndrome, diff, has_nul +	cbnz	syndrome, .Lcal_cmpresult +	/*How far is the current str2 from the alignment boundary...*/ +	and	tmp3, tmp3, #7 +.Lrecal_offset: +	neg	pos, tmp3 +.Lloopcmp_proc: +	/* +	* Divide the eight bytes into two parts. First,backwards the src2 +	* to an alignment boundary,load eight bytes from the SRC2 alignment +	* boundary,then compare with the relative bytes from SRC1. +	* If all 8 bytes are equal,then start the second part's comparison. +	* Otherwise finish the comparison. +	* This special handle can garantee all the accesses are in the +	* thread/task space in avoid to overrange access. +	*/ +	ldr	data1, [src1,pos] +	ldr	data2, [src2,pos] +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	bic	has_nul, tmp1, tmp2 +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	orr	syndrome, diff, has_nul +	cbnz	syndrome, .Lcal_cmpresult + +	/*The second part process*/ +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	bic	has_nul, tmp1, tmp2 +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	orr	syndrome, diff, has_nul +	cbz	syndrome, .Lloopcmp_proc + +.Lcal_cmpresult: +	/* +	* reversed the byte-order as big-endian,then CLZ can find the most +	* significant zero bits. +	*/ +CPU_LE( rev	syndrome, syndrome ) +CPU_LE( rev	data1, data1 ) +CPU_LE( rev	data2, data2 ) + +	/* +	* For big-endian we cannot use the trick with the syndrome value +	* as carry-propagation can corrupt the upper bits if the trailing +	* bytes in the string contain 0x01. +	* However, if there is no NUL byte in the dword, we can generate +	* the result directly.  We ca not just subtract the bytes as the +	* MSB might be significant. +	*/ +CPU_BE( cbnz	has_nul, 1f ) +CPU_BE( cmp	data1, data2 ) +CPU_BE( cset	result, ne ) +CPU_BE( cneg	result, result, lo ) +CPU_BE( ret ) +CPU_BE( 1: ) +	/*Re-compute the NUL-byte detection, using a byte-reversed value. */ +CPU_BE(	rev	tmp3, data1 ) +CPU_BE(	sub	tmp1, tmp3, zeroones ) +CPU_BE(	orr	tmp2, tmp3, #REP8_7f ) +CPU_BE(	bic	has_nul, tmp1, tmp2 ) +CPU_BE(	rev	has_nul, has_nul ) +CPU_BE(	orr	syndrome, diff, has_nul ) + +	clz	pos, syndrome +	/* +	* The MS-non-zero bit of the syndrome marks either the first bit +	* that is different, or the top bit of the first zero byte. +	* Shifting left now will bring the critical information into the +	* top bits. +	*/ +	lsl	data1, data1, pos +	lsl	data2, data2, pos +	/* +	* But we need to zero-extend (char is unsigned) the value and then +	* perform a signed 32-bit subtraction. +	*/ +	lsr	data1, data1, #56 +	sub	result, data1, data2, lsr #56 +	ret +ENDPROC(strcmp) diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S new file mode 100644 index 00000000000..987b68b9ce4 --- /dev/null +++ b/arch/arm64/lib/strlen.S @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. 
+ * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * calculate the length of a string + * + * Parameters: + *	x0 - const string pointer + * Returns: + *	x0 - the return length of specific string + */ + +/* Arguments and results.  */ +srcin		.req	x0 +len		.req	x0 + +/* Locals and temporaries.  */ +src		.req	x1 +data1		.req	x2 +data2		.req	x3 +data2a		.req	x4 +has_nul1	.req	x5 +has_nul2	.req	x6 +tmp1		.req	x7 +tmp2		.req	x8 +tmp3		.req	x9 +tmp4		.req	x10 +zeroones	.req	x11 +pos		.req	x12 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +ENTRY(strlen) +	mov	zeroones, #REP8_01 +	bic	src, srcin, #15 +	ands	tmp1, srcin, #15 +	b.ne	.Lmisaligned +	/* +	* NUL detection works on the principle that (X - 1) & (~X) & 0x80 +	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and +	* can be done in parallel across the entire word. +	*/ +	/* +	* The inner loop deals with two Dwords at a time. This has a +	* slightly higher start-up cost, but we should win quite quickly, +	* especially on cores with a high number of issue slots per +	* cycle, as we get much better parallelism out of the operations. +	*/ +.Lloop: +	ldp	data1, data2, [src], #16 +.Lrealigned: +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	sub	tmp3, data2, zeroones +	orr	tmp4, data2, #REP8_7f +	bic	has_nul1, tmp1, tmp2 +	bics	has_nul2, tmp3, tmp4 +	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */ +	b.eq	.Lloop + +	sub	len, src, srcin +	cbz	has_nul1, .Lnul_in_data2 +CPU_BE(	mov	data2, data1 )	/*prepare data to re-calculate the syndrome*/ +	sub	len, len, #8 +	mov	has_nul2, has_nul1 +.Lnul_in_data2: +	/* +	* For big-endian, carry propagation (if the final byte in the +	* string is 0x01) means we cannot use has_nul directly.  The +	* easiest way to get the correct byte is to byte-swap the data +	* and calculate the syndrome a second time. +	*/ +CPU_BE( rev	data2, data2 ) +CPU_BE( sub	tmp1, data2, zeroones ) +CPU_BE( orr	tmp2, data2, #REP8_7f ) +CPU_BE( bic	has_nul2, tmp1, tmp2 ) + +	sub	len, len, #8 +	rev	has_nul2, has_nul2 +	clz	pos, has_nul2 +	add	len, len, pos, lsr #3		/* Bits to bytes.  */ +	ret + +.Lmisaligned: +	cmp	tmp1, #8 +	neg	tmp1, tmp1 +	ldp	data1, data2, [src], #16 +	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */ +	mov	tmp2, #~0 +	/* Big-endian.  Early bytes are at MSB.  */ +CPU_BE( lsl	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */ +	/* Little-endian.  Early bytes are at LSB.  */ +CPU_LE( lsr	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  
*/ + +	orr	data1, data1, tmp2 +	orr	data2a, data2, tmp2 +	csinv	data1, data1, xzr, le +	csel	data2, data2, data2a, le +	b	.Lrealigned +ENDPROC(strlen) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S new file mode 100644 index 00000000000..0224cf5a553 --- /dev/null +++ b/arch/arm64/lib/strncmp.S @@ -0,0 +1,310 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * compare two strings + * + * Parameters: + *  x0 - const string 1 pointer + *  x1 - const string 2 pointer + *  x2 - the maximal length to be compared + * Returns: + *  x0 - an integer less than, equal to, or greater than zero if s1 is found, + *     respectively, to be less than, to match, or be greater than s2. + */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +/* Parameters and result.  */ +src1		.req	x0 +src2		.req	x1 +limit		.req	x2 +result		.req	x0 + +/* Internal variables.  */ +data1		.req	x3 +data1w		.req	w3 +data2		.req	x4 +data2w		.req	w4 +has_nul		.req	x5 +diff		.req	x6 +syndrome	.req	x7 +tmp1		.req	x8 +tmp2		.req	x9 +tmp3		.req	x10 +zeroones	.req	x11 +pos		.req	x12 +limit_wd	.req	x13 +mask		.req	x14 +endloop		.req	x15 + +ENTRY(strncmp) +	cbz	limit, .Lret0 +	eor	tmp1, src1, src2 +	mov	zeroones, #REP8_01 +	tst	tmp1, #7 +	b.ne	.Lmisaligned8 +	ands	tmp1, src1, #7 +	b.ne	.Lmutual_align +	/* Calculate the number of full and partial words -1.  */ +	/* +	* when limit is mulitply of 8, if not sub 1, +	* the judgement of last dword will wrong. +	*/ +	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */ +	lsr	limit_wd, limit_wd, #3  /* Convert to Dwords.  */ + +	/* +	* NUL detection works on the principle that (X - 1) & (~X) & 0x80 +	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and +	* can be done in parallel across the entire word. +	*/ +.Lloop_aligned: +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 +.Lstart_realigned: +	subs	limit_wd, limit_wd, #1 +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	csinv	endloop, diff, xzr, pl  /* Last Dword or differences.*/ +	bics	has_nul, tmp1, tmp2 /* Non-zero if NUL terminator.  */ +	ccmp	endloop, #0, #0, eq +	b.eq	.Lloop_aligned + +	/*Not reached the limit, must have found the end or a diff.  */ +	tbz	limit_wd, #63, .Lnot_limit + +	/* Limit % 8 == 0 => all bytes significant.  */ +	ands	limit, limit, #7 +	b.eq	.Lnot_limit + +	lsl	limit, limit, #3    /* Bits -> bytes.  
*/ +	mov	mask, #~0 +CPU_BE( lsr	mask, mask, limit ) +CPU_LE( lsl	mask, mask, limit ) +	bic	data1, data1, mask +	bic	data2, data2, mask + +	/* Make sure that the NUL byte is marked in the syndrome.  */ +	orr	has_nul, has_nul, mask + +.Lnot_limit: +	orr	syndrome, diff, has_nul +	b	.Lcal_cmpresult + +.Lmutual_align: +	/* +	* Sources are mutually aligned, but are not currently at an +	* alignment boundary.  Round down the addresses and then mask off +	* the bytes that precede the start point. +	* We also need to adjust the limit calculations, but without +	* overflowing if the limit is near ULONG_MAX. +	*/ +	bic	src1, src1, #7 +	bic	src2, src2, #7 +	ldr	data1, [src1], #8 +	neg	tmp3, tmp1, lsl #3  /* 64 - bits(bytes beyond align). */ +	ldr	data2, [src2], #8 +	mov	tmp2, #~0 +	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */ +	/* Big-endian.  Early bytes are at MSB.  */ +CPU_BE( lsl	tmp2, tmp2, tmp3 )	/* Shift (tmp1 & 63).  */ +	/* Little-endian.  Early bytes are at LSB.  */ +CPU_LE( lsr	tmp2, tmp2, tmp3 )	/* Shift (tmp1 & 63).  */ + +	and	tmp3, limit_wd, #7 +	lsr	limit_wd, limit_wd, #3 +	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/ +	add	limit, limit, tmp1 +	add	tmp3, tmp3, tmp1 +	orr	data1, data1, tmp2 +	orr	data2, data2, tmp2 +	add	limit_wd, limit_wd, tmp3, lsr #3 +	b	.Lstart_realigned + +/*when src1 offset is not equal to src2 offset...*/ +.Lmisaligned8: +	cmp	limit, #8 +	b.lo	.Ltiny8proc /*limit < 8... */ +	/* +	* Get the align offset length to compare per byte first. +	* After this process, one string's address will be aligned.*/ +	and	tmp1, src1, #7 +	neg	tmp1, tmp1 +	add	tmp1, tmp1, #8 +	and	tmp2, src2, #7 +	neg	tmp2, tmp2 +	add	tmp2, tmp2, #8 +	subs	tmp3, tmp1, tmp2 +	csel	pos, tmp1, tmp2, hi /*Choose the maximum. */ +	/* +	* Here, limit is not less than 8, so directly run .Ltinycmp +	* without checking the limit.*/ +	sub	limit, limit, pos +.Ltinycmp: +	ldrb	data1w, [src1], #1 +	ldrb	data2w, [src2], #1 +	subs	pos, pos, #1 +	ccmp	data1w, #1, #0, ne  /* NZCV = 0b0000.  */ +	ccmp	data1w, data2w, #0, cs  /* NZCV = 0b0000.  */ +	b.eq	.Ltinycmp +	cbnz	pos, 1f /*find the null or unequal...*/ +	cmp	data1w, #1 +	ccmp	data1w, data2w, #0, cs +	b.eq	.Lstart_align /*the last bytes are equal....*/ +1: +	sub	result, data1, data2 +	ret + +.Lstart_align: +	lsr	limit_wd, limit, #3 +	cbz	limit_wd, .Lremain8 +	/*process more leading bytes to make str1 aligned...*/ +	ands	xzr, src1, #7 +	b.eq	.Lrecal_offset +	add	src1, src1, tmp3	/*tmp3 is positive in this branch.*/ +	add	src2, src2, tmp3 +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 + +	sub	limit, limit, tmp3 +	lsr	limit_wd, limit, #3 +	subs	limit_wd, limit_wd, #1 + +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	csinv	endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ +	bics	has_nul, tmp1, tmp2 +	ccmp	endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ +	b.ne	.Lunequal_proc +	/*How far is the current str2 from the alignment boundary...*/ +	and	tmp3, tmp3, #7 +.Lrecal_offset: +	neg	pos, tmp3 +.Lloopcmp_proc: +	/* +	* Divide the eight bytes into two parts. First,backwards the src2 +	* to an alignment boundary,load eight bytes from the SRC2 alignment +	* boundary,then compare with the relative bytes from SRC1. +	* If all 8 bytes are equal,then start the second part's comparison. +	* Otherwise finish the comparison. +	* This special handle can garantee all the accesses are in the +	* thread/task space in avoid to overrange access. 
+	*/ +	ldr	data1, [src1,pos] +	ldr	data2, [src2,pos] +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	bics	has_nul, tmp1, tmp2 /* Non-zero if NUL terminator.  */ +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	csinv	endloop, diff, xzr, eq +	cbnz	endloop, .Lunequal_proc + +	/*The second part process*/ +	ldr	data1, [src1], #8 +	ldr	data2, [src2], #8 +	subs	limit_wd, limit_wd, #1 +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	eor	diff, data1, data2  /* Non-zero if differences found.  */ +	csinv	endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ +	bics	has_nul, tmp1, tmp2 +	ccmp	endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ +	b.eq	.Lloopcmp_proc + +.Lunequal_proc: +	orr	syndrome, diff, has_nul +	cbz	syndrome, .Lremain8 +.Lcal_cmpresult: +	/* +	* reversed the byte-order as big-endian,then CLZ can find the most +	* significant zero bits. +	*/ +CPU_LE( rev	syndrome, syndrome ) +CPU_LE( rev	data1, data1 ) +CPU_LE( rev	data2, data2 ) +	/* +	* For big-endian we cannot use the trick with the syndrome value +	* as carry-propagation can corrupt the upper bits if the trailing +	* bytes in the string contain 0x01. +	* However, if there is no NUL byte in the dword, we can generate +	* the result directly.  We can't just subtract the bytes as the +	* MSB might be significant. +	*/ +CPU_BE( cbnz	has_nul, 1f ) +CPU_BE( cmp	data1, data2 ) +CPU_BE( cset	result, ne ) +CPU_BE( cneg	result, result, lo ) +CPU_BE( ret ) +CPU_BE( 1: ) +	/* Re-compute the NUL-byte detection, using a byte-reversed value.*/ +CPU_BE( rev	tmp3, data1 ) +CPU_BE( sub	tmp1, tmp3, zeroones ) +CPU_BE( orr	tmp2, tmp3, #REP8_7f ) +CPU_BE( bic	has_nul, tmp1, tmp2 ) +CPU_BE( rev	has_nul, has_nul ) +CPU_BE( orr	syndrome, diff, has_nul ) +	/* +	* The MS-non-zero bit of the syndrome marks either the first bit +	* that is different, or the top bit of the first zero byte. +	* Shifting left now will bring the critical information into the +	* top bits. +	*/ +	clz	pos, syndrome +	lsl	data1, data1, pos +	lsl	data2, data2, pos +	/* +	* But we need to zero-extend (char is unsigned) the value and then +	* perform a signed 32-bit subtraction. +	*/ +	lsr	data1, data1, #56 +	sub	result, data1, data2, lsr #56 +	ret + +.Lremain8: +	/* Limit % 8 == 0 => all bytes significant.  */ +	ands	limit, limit, #7 +	b.eq	.Lret0 +.Ltiny8proc: +	ldrb	data1w, [src1], #1 +	ldrb	data2w, [src2], #1 +	subs	limit, limit, #1 + +	ccmp	data1w, #1, #0, ne  /* NZCV = 0b0000.  */ +	ccmp	data1w, data2w, #0, cs  /* NZCV = 0b0000.  */ +	b.eq	.Ltiny8proc +	sub	result, data1, data2 +	ret + +.Lret0: +	mov	result, #0 +	ret +ENDPROC(strncmp) diff --git a/arch/arm64/lib/strncpy_from_user.S b/arch/arm64/lib/strncpy_from_user.S deleted file mode 100644 index 56e448a831a..00000000000 --- a/arch/arm64/lib/strncpy_from_user.S +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Based on arch/arm/lib/strncpy_from_user.S - * - * Copyright (C) 1995-2000 Russell King - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program.  
If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> -#include <asm/errno.h> - -	.text -	.align	5 - -/* - * Copy a string from user space to kernel space. - *  x0 = dst, x1 = src, x2 = byte length - * returns the number of characters copied (strlen of copied string), - *  -EFAULT on exception, or "len" if we fill the whole buffer - */ -ENTRY(__strncpy_from_user) -	mov	x4, x1 -1:	subs	x2, x2, #1 -	bmi	2f -USER(9f, ldrb	w3, [x1], #1	) -	strb	w3, [x0], #1 -	cbnz	w3, 1b -	sub	x1, x1, #1	// take NUL character out of count -2:	sub	x0, x1, x4 -	ret -ENDPROC(__strncpy_from_user) - -	.section .fixup,"ax" -	.align	0 -9:	strb	wzr, [x0]	// null terminate -	mov	x0, #-EFAULT -	ret -	.previous diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S new file mode 100644 index 00000000000..2ca665711bf --- /dev/null +++ b/arch/arm64/lib/strnlen.S @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * determine the length of a fixed-size string + * + * Parameters: + *	x0 - const string pointer + *	x1 - maximal string length + * Returns: + *	x0 - the return length of specific string + */ + +/* Arguments and results.  */ +srcin		.req	x0 +len		.req	x0 +limit		.req	x1 + +/* Locals and temporaries.  */ +src		.req	x2 +data1		.req	x3 +data2		.req	x4 +data2a		.req	x5 +has_nul1	.req	x6 +has_nul2	.req	x7 +tmp1		.req	x8 +tmp2		.req	x9 +tmp3		.req	x10 +tmp4		.req	x11 +zeroones	.req	x12 +pos		.req	x13 +limit_wd	.req	x14 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +ENTRY(strnlen) +	cbz	limit, .Lhit_limit +	mov	zeroones, #REP8_01 +	bic	src, srcin, #15 +	ands	tmp1, srcin, #15 +	b.ne	.Lmisaligned +	/* Calculate the number of full and partial words -1.  */ +	sub	limit_wd, limit, #1 /* Limit != 0, so no underflow.  */ +	lsr	limit_wd, limit_wd, #4  /* Convert to Qwords.  */ + +	/* +	* NUL detection works on the principle that (X - 1) & (~X) & 0x80 +	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and +	* can be done in parallel across the entire word. +	*/ +	/* +	* The inner loop deals with two Dwords at a time.  This has a +	* slightly higher start-up cost, but we should win quite quickly, +	* especially on cores with a high number of issue slots per +	* cycle, as we get much better parallelism out of the operations. 
+	*/ +.Lloop: +	ldp	data1, data2, [src], #16 +.Lrealigned: +	sub	tmp1, data1, zeroones +	orr	tmp2, data1, #REP8_7f +	sub	tmp3, data2, zeroones +	orr	tmp4, data2, #REP8_7f +	bic	has_nul1, tmp1, tmp2 +	bic	has_nul2, tmp3, tmp4 +	subs	limit_wd, limit_wd, #1 +	orr	tmp1, has_nul1, has_nul2 +	ccmp	tmp1, #0, #0, pl    /* NZCV = 0000  */ +	b.eq	.Lloop + +	cbz	tmp1, .Lhit_limit   /* No null in final Qword.  */ + +	/* +	* We know there's a null in the final Qword. The easiest thing +	* to do now is work out the length of the string and return +	* MIN (len, limit). +	*/ +	sub	len, src, srcin +	cbz	has_nul1, .Lnul_in_data2 +CPU_BE( mov	data2, data1 )	/*perpare data to re-calculate the syndrome*/ + +	sub	len, len, #8 +	mov	has_nul2, has_nul1 +.Lnul_in_data2: +	/* +	* For big-endian, carry propagation (if the final byte in the +	* string is 0x01) means we cannot use has_nul directly.  The +	* easiest way to get the correct byte is to byte-swap the data +	* and calculate the syndrome a second time. +	*/ +CPU_BE( rev	data2, data2 ) +CPU_BE( sub	tmp1, data2, zeroones ) +CPU_BE( orr	tmp2, data2, #REP8_7f ) +CPU_BE( bic	has_nul2, tmp1, tmp2 ) + +	sub	len, len, #8 +	rev	has_nul2, has_nul2 +	clz	pos, has_nul2 +	add	len, len, pos, lsr #3       /* Bits to bytes.  */ +	cmp	len, limit +	csel	len, len, limit, ls     /* Return the lower value.  */ +	ret + +.Lmisaligned: +	/* +	* Deal with a partial first word. +	* We're doing two things in parallel here; +	* 1) Calculate the number of words (but avoiding overflow if +	* limit is near ULONG_MAX) - to do this we need to work out +	* limit + tmp1 - 1 as a 65-bit value before shifting it; +	* 2) Load and mask the initial data words - we force the bytes +	* before the ones we are interested in to 0xff - this ensures +	* early bytes will not hit any zero detection. +	*/ +	ldp	data1, data2, [src], #16 + +	sub	limit_wd, limit, #1 +	and	tmp3, limit_wd, #15 +	lsr	limit_wd, limit_wd, #4 + +	add	tmp3, tmp3, tmp1 +	add	limit_wd, limit_wd, tmp3, lsr #4 + +	neg	tmp4, tmp1 +	lsl	tmp4, tmp4, #3  /* Bytes beyond alignment -> bits.  */ + +	mov	tmp2, #~0 +	/* Big-endian.  Early bytes are at MSB.  */ +CPU_BE( lsl	tmp2, tmp2, tmp4 )	/* Shift (tmp1 & 63).  */ +	/* Little-endian.  Early bytes are at LSB.  */ +CPU_LE( lsr	tmp2, tmp2, tmp4 )	/* Shift (tmp1 & 63).  */ + +	cmp	tmp1, #8 + +	orr	data1, data1, tmp2 +	orr	data2a, data2, tmp2 + +	csinv	data1, data1, xzr, le +	csel	data2, data2, data2a, le +	b	.Lrealigned + +.Lhit_limit: +	mov	len, limit +	ret +ENDPROC(strnlen) diff --git a/arch/arm64/lib/strnlen_user.S b/arch/arm64/lib/strnlen_user.S deleted file mode 100644 index 7f7b176a564..00000000000 --- a/arch/arm64/lib/strnlen_user.S +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Based on arch/arm/lib/strnlen_user.S - * - * Copyright (C) 1995-2000 Russell King - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program.  If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <linux/linkage.h> -#include <asm/assembler.h> -#include <asm/errno.h> - -	.text -	.align	5 - -/* Prototype: unsigned long __strnlen_user(const char *str, long n) - * Purpose  : get length of a string in user memory - * Params   : str - address of string in user memory - * Returns  : length of string *including terminator* - *	      or zero on exception, or n if too long - */ -ENTRY(__strnlen_user) -	mov	x2, x0 -1:	subs	x1, x1, #1 -	b.mi	2f -USER(9f, ldrb	w3, [x0], #1	) -	cbnz	w3, 1b -2:	sub	x0, x0, x2 -	ret -ENDPROC(__strnlen_user) - -	.section .fixup,"ax" -	.align	0 -9:	mov	x0, #0 -	ret -	.previous  | 
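
Illustration only, not part of the patch above: the new strcmp, strlen, strncmp and strnlen routines all rely on the word-at-a-time NUL test described in their comments, (X - 1) & (~X) & 0x80 per byte, rewritten as (X - 1) & ~(X | 0x7f). A minimal userspace C sketch of the idea follows; the helper name has_nul64() is invented for this sketch and does not exist in the kernel.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/*
 * Non-zero iff at least one byte of x is 0x00.  (x - REP8_01) subtracts 1
 * from every byte; ~(x | REP8_7f) keeps only bit 7 of bytes whose top bit
 * was clear.  When a zero byte exists, its bit 7 is set in the result.
 * Bits above the first zero byte are not reliable (borrow propagation),
 * which is why the big-endian paths in the patch recompute the syndrome.
 */
static uint64_t has_nul64(uint64_t x)
{
	return (x - REP8_01) & ~(x | REP8_7f);
}

int main(void)
{
	uint64_t a, b;

	memcpy(&a, "abcdefgh", 8);	/* no NUL in these 8 bytes */
	memcpy(&b, "abc\0efgh", 8);	/* NUL in byte 3 */

	printf("%d %d\n", has_nul64(a) != 0, has_nul64(b) != 0);	/* prints: 0 1 */
	return 0;
}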
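
Illustration only, not part of the patch: the new memset zeroes large buffers with DC ZVA, sizing the block from DCZID_EL0 exactly as the "mrs tmp1, dczid_el0" sequence does (4 << BS bytes, with bit 4 meaning ZVA is prohibited). A hedged AArch64-only C sketch; the helper name zva_block_size() is invented here, and the register read assumes you are building and running on an AArch64 machine where DCZID_EL0 is accessible.

#include <stdint.h>
#include <stdio.h>

/*
 * Return the DC ZVA block size in bytes, or 0 if zeroing is prohibited.
 * DCZID_EL0[3:0] (BS) is log2 of the block size in 4-byte words, so the
 * size in bytes is 4 << BS; DCZID_EL0[4] (DZP) set means DC ZVA must not
 * be used.  memset.S additionally falls back to plain stores when the
 * block size is smaller than 64 bytes.
 */
static unsigned long zva_block_size(void)
{
	unsigned long dczid;

	asm volatile("mrs %0, dczid_el0" : "=r" (dczid));
	if (dczid & (1UL << 4))		/* DZP: zeroing prohibited */
		return 0;
	return 4UL << (dczid & 0xf);
}

int main(void)
{
	printf("DC ZVA block size: %lu bytes\n", zva_block_size());
	return 0;
}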
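
Illustration only, not part of the patch: the .Lmutual_align paths round the source pointers down to an 8-byte boundary and then OR a mask over the loaded doubleword so the bytes before the real start point become 0xff and can neither look like a NUL nor like a difference. A little-endian C sketch of that masking (mirroring the CPU_LE "lsr tmp2, tmp2, tmp1; orr data1, data1, tmp2" sequence); the helper name mask_leading_bytes() is invented, and offset must be 1..7 since the aligned case takes a different path in the assembly.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Force the low 'offset' bytes (the bytes that precede the real start of
 * the string on a little-endian load) to 0xff.  'offset' = pointer & 7,
 * assumed to be in 1..7 here.
 */
static uint64_t mask_leading_bytes(uint64_t data, unsigned int offset)
{
	uint64_t mask = ~(uint64_t)0 >> (64 - 8 * offset);

	return data | mask;
}

int main(void)
{
	/* Pretend the string "hi" starts 3 bytes into an aligned doubleword. */
	char buf[8] = { 'x', 'y', 'z', 'h', 'i', 0, 0, 0 };
	uint64_t data;

	memcpy(&data, buf, 8);
	printf("%016llx\n", (unsigned long long)mask_leading_bytes(data, 3));
	return 0;
}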
