i386: move lib

Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Thomas Gleixner <tglx@linutronix.de> 2007-10-11 11:16:33 +0200
committer: Thomas Gleixner <tglx@linutronix.de> 2007-10-11 11:16:33 +0200
commit: 44f0257fc316ff4b33aa3438dd8d891b7d6d72b9 (patch)
tree: c1a9a571db37d631489f18e1dfe5554874b19027 /arch/x86/lib
parent: da957e111bb0c189a4a3bf8a00caaecb59ed94ca (diff)
14 files changed, 2865 insertions, 0 deletions
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
new file mode 100644
index 00000000000..2d7d724a2a6
--- /dev/null
+++ b/arch/x86/lib/Makefile
@@ -0,0 +1,5 @@
+ifeq ($(CONFIG_X86_32),y)
+include ${srctree}/arch/x86/lib/Makefile_32
+else
+include ${srctree}/arch/x86_64/lib/Makefile_64
+endif
diff --git a/arch/x86/lib/Makefile_32 b/arch/x86/lib/Makefile_32
new file mode 100644
index 00000000000..98d1f1e2e2e
--- /dev/null
+++ b/arch/x86/lib/Makefile_32
@@ -0,0 +1,11 @@
+#
+# Makefile for i386-specific library files..
+#
+
+
+lib-y = checksum_32.o delay_32.o usercopy_32.o getuser_32.o putuser_32.o memcpy_32.o strstr_32.o \
+	bitops_32.o semaphore_32.o string_32.o
+
+lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
+
+obj-$(CONFIG_SMP)	+= msr-on-cpu.o
diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c
new file mode 100644
index 00000000000..afd0045595d
--- /dev/null
+++ b/arch/x86/lib/bitops_32.c
@@ -0,0 +1,70 @@
+#include <linux/bitops.h>
+#include <linux/module.h>
+
+/**
+ * find_next_bit - find the first set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+int find_next_bit(const unsigned long *addr, int size, int offset)
+{
+	const unsigned long *p = addr + (offset >> 5);
+	int set = 0, bit = offset & 31, res;
+
+	if (bit) {
+		/*
+		 * Look for nonzero in the first 32 bits:
+		 */
+		__asm__("bsfl %1,%0\n\t"
+			"jne 1f\n\t"
+			"movl $32, %0\n"
+			"1:"
+			: "=r" (set)
+			: "r" (*p >> bit));
+		if (set < (32 - bit))
+			return set + offset;
+		set = 32 - bit;
+		p++;
+	}
+	/*
+	 * No set bit yet, search remaining full words for a bit
+	 */
+	res = find_first_bit (p, size - 32 * (p - addr));
+	return (offset + set + res);
+}
+EXPORT_SYMBOL(find_next_bit);
+
+/**
+ * find_next_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+int find_next_zero_bit(const unsigned long *addr, int size, int offset)
+{
+	const unsigned long *p = addr + (offset >> 5);
+	int set = 0, bit = offset & 31, res;
+
+	if (bit) {
+		/*
+		 * Look for zero in the first 32 bits.
+		 */
+		__asm__("bsfl %1,%0\n\t"
+			"jne 1f\n\t"
+			"movl $32, %0\n"
+			"1:"
+			: "=r" (set)
+			: "r" (~(*p >> bit)));
+		if (set < (32 - bit))
+			return set + offset;
+		set = 32 - bit;
+		p++;
+	}
+	/*
+	 * No zero yet, search remaining full bytes for a zero
+	 */
+	res = find_first_zero_bit(p, size - 32 * (p - addr));
+	return (offset + set + res);
+}
+EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
new file mode 100644
index 00000000000..adbccd0bbb7
--- /dev/null
+++ b/arch/x86/lib/checksum_32.S
@@ -0,0 +1,546 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IP/TCP/UDP checksumming routines
+ *
+ * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Tom May, <ftom@netcom.com>
+ *              Pentium Pro/II routines:
+ *              Alexander Kjeldaas <astor@guardian.no>
+ *              Finn Arne Gangstad <finnag@guardian.no>
+ *		Lots of code moved from tcp.c and ip.c; see those files
+ *		for more names.
+ *
+ * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
+ *			     handling.
+ *		Andi Kleen,  add zeroing on error
+ *                   converted to pure assembler
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/errno.h>
+				
+/*
+ * computes a partial checksum, e.g. for TCP/UDP fragments
+ */
+
+/*	
+unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
+ */
+		
+.text
+		
+#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
+
+	  /*		
+	   * Experiments with Ethernet and SLIP connections show that buff
+	   * is aligned on either a 2-byte or 4-byte boundary.  We get at
+	   * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
+	   * Fortunately, it is easy to convert 2-byte alignment to 4-byte
+	   * alignment for the unrolled loop.
+	   */		
+ENTRY(csum_partial)
+	CFI_STARTPROC
+	pushl %esi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET esi, 0
+	pushl %ebx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ebx, 0
+	movl 20(%esp),%eax	# Function arg: unsigned int sum
+	movl 16(%esp),%ecx	# Function arg: int len
+	movl 12(%esp),%esi	# Function arg: unsigned char *buff
+	testl $3, %esi		# Check alignment.
+	jz 2f			# Jump if alignment is ok.
+	testl $1, %esi		# Check alignment.
+	jz 10f			# Jump if alignment is boundary of 2bytes.
+
+	# buf is odd
+	dec %ecx
+	jl 8f
+	movzbl (%esi), %ebx
+	adcl %ebx, %eax
+	roll $8, %eax
+	inc %esi
+	testl $2, %esi
+	jz 2f
+10:
+	subl $2, %ecx		# Alignment uses up two bytes.
+	jae 1f			# Jump if we had at least two bytes.
+	addl $2, %ecx		# ecx was < 2.  Deal with it.
+	jmp 4f
+1:	movw (%esi), %bx
+	addl $2, %esi
+	addw %bx, %ax
+	adcl $0, %eax
+2:
+	movl %ecx, %edx
+	shrl $5, %ecx
+	jz 2f
+	testl %esi, %esi
+1:	movl (%esi), %ebx
+	adcl %ebx, %eax
+	movl 4(%esi), %ebx
+	adcl %ebx, %eax
+	movl 8(%esi), %ebx
+	adcl %ebx, %eax
+	movl 12(%esi), %ebx
+	adcl %ebx, %eax
+	movl 16(%esi), %ebx
+	adcl %ebx, %eax
+	movl 20(%esi), %ebx
+	adcl %ebx, %eax
+	movl 24(%esi), %ebx
+	adcl %ebx, %eax
+	movl 28(%esi), %ebx
+	adcl %ebx, %eax
+	lea 32(%esi), %esi
+	dec %ecx
+	jne 1b
+	adcl $0, %eax
+2:	movl %edx, %ecx
+	andl $0x1c, %edx
+	je 4f
+	shrl $2, %edx		# This clears CF
+3:	adcl (%esi), %eax
+	lea 4(%esi), %esi
+	dec %edx
+	jne 3b
+	adcl $0, %eax
+4:	andl $3, %ecx
+	jz 7f
+	cmpl $2, %ecx
+	jb 5f
+	movw (%esi),%cx
+	leal 2(%esi),%esi
+	je 6f
+	shll $16,%ecx
+5:	movb (%esi),%cl
+6:	addl %ecx,%eax
+	adcl $0, %eax 
+7:	
+	testl $1, 12(%esp)
+	jz 8f
+	roll $8, %eax
+8:
+	popl %ebx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE ebx
+	popl %esi
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE esi
+	ret
+	CFI_ENDPROC
+ENDPROC(csum_partial)
+
+#else
+
+/* Version for PentiumII/PPro */
+
+ENTRY(csum_partial)
+	CFI_STARTPROC
+	pushl %esi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET esi, 0
+	pushl %ebx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ebx, 0
+	movl 20(%esp),%eax	# Function arg: unsigned int sum
+	movl 16(%esp),%ecx	# Function arg: int len
+	movl 12(%esp),%esi	# Function arg:	const unsigned char *buf
+
+	testl $3, %esi         
+	jnz 25f                 
+10:
+	movl %ecx, %edx
+	movl %ecx, %ebx
+	andl $0x7c, %ebx
+	shrl $7, %ecx
+	addl %ebx,%esi
+	shrl $2, %ebx  
+	negl %ebx
+	lea 45f(%ebx,%ebx,2), %ebx
+	testl %esi, %esi
+	jmp *%ebx
+
+	# Handle 2-byte-aligned regions
+20:	addw (%esi), %ax
+	lea 2(%esi), %esi
+	adcl $0, %eax
+	jmp 10b
+25:
+	testl $1, %esi         
+	jz 30f                 
+	# buf is odd
+	dec %ecx
+	jl 90f
+	movzbl (%esi), %ebx
+	addl %ebx, %eax
+	adcl $0, %eax
+	roll $8, %eax
+	inc %esi
+	testl $2, %esi
+	jz 10b
+
+30:	subl $2, %ecx          
+	ja 20b                 
+	je 32f
+	addl $2, %ecx
+	jz 80f
+	movzbl (%esi),%ebx	# csumming 1 byte, 2-aligned
+	addl %ebx, %eax
+	adcl $0, %eax
+	jmp 80f
+32:
+	addw (%esi), %ax	# csumming 2 bytes, 2-aligned
+	adcl $0, %eax
+	jmp 80f
+
+40: 
+	addl -128(%esi), %eax
+	adcl -124(%esi), %eax
+	adcl -120(%esi), %eax
+	adcl -116(%esi), %eax   
+	adcl -112(%esi), %eax   
+	adcl -108(%esi), %eax
+	adcl -104(%esi), %eax
+	adcl -100(%esi), %eax
+	adcl -96(%esi), %eax
+	adcl -92(%esi), %eax
+	adcl -88(%esi), %eax
+	adcl -84(%esi), %eax
+	adcl -80(%esi), %eax
+	adcl -76(%esi), %eax
+	adcl -72(%esi), %eax
+	adcl -68(%esi), %eax
+	adcl -64(%esi), %eax     
+	adcl -60(%esi), %eax     
+	adcl -56(%esi), %eax     
+	adcl -52(%esi), %eax   
+	adcl -48(%esi), %eax   
+	adcl -44(%esi), %eax
+	adcl -40(%esi), %eax
+	adcl -36(%esi), %eax
+	adcl -32(%esi), %eax
+	adcl -28(%esi), %eax
+	adcl -24(%esi), %eax
+	adcl -20(%esi), %eax
+	adcl -16(%esi), %eax
+	adcl -12(%esi), %eax
+	adcl -8(%esi), %eax
+	adcl -4(%esi), %eax
+45:
+	lea 128(%esi), %esi
+	adcl $0, %eax
+	dec %ecx
+	jge 40b
+	movl %edx, %ecx
+50:	andl $3, %ecx
+	jz 80f
+
+	# Handle the last 1-3 bytes without jumping
+	notl %ecx		# 1->2, 2->1, 3->0, higher bits are masked
+	movl $0xffffff,%ebx	# by the shll and shrl instructions
+	shll $3,%ecx
+	shrl %cl,%ebx
+	andl -128(%esi),%ebx	# esi is 4-aligned so should be ok
+	addl %ebx,%eax
+	adcl $0,%eax
+80: 
+	testl $1, 12(%esp)
+	jz 90f
+	roll $8, %eax
+90: 
+	popl %ebx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE ebx
+	popl %esi
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE esi
+	ret
+	CFI_ENDPROC
+ENDPROC(csum_partial)
+				
+#endif
+
+/*
+unsigned int csum_partial_copy_generic (const char *src, char *dst,
+				  int len, int sum, int *src_err_ptr, int *dst_err_ptr)
+ */ 
+
+/*
+ * Copy from ds while checksumming, otherwise like csum_partial
+ *
+ * The macros SRC and DST specify the type of access for the instruction.
+ * thus we can call a custom exception handler for all access types.
+ *
+ * FIXME: could someone double-check whether I haven't mixed up some SRC and
+ *	  DST definitions? It's damn hard to trigger all cases.  I hope I got
+ *	  them all but there's no guarantee.
+ */
+
+#define SRC(y...)			\
+	9999: y;			\
+	.section __ex_table, "a";	\
+	.long 9999b, 6001f	;	\
+	.previous
+
+#define DST(y...)			\
+	9999: y;			\
+	.section __ex_table, "a";	\
+	.long 9999b, 6002f	;	\
+	.previous
+
+#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
+
+#define ARGBASE 16		
+#define FP		12
+		
+ENTRY(csum_partial_copy_generic)
+	CFI_STARTPROC
+	subl  $4,%esp	
+	CFI_ADJUST_CFA_OFFSET 4
+	pushl %edi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edi, 0
+	pushl %esi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET esi, 0
+	pushl %ebx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ebx, 0
+	movl ARGBASE+16(%esp),%eax	# sum
+	movl ARGBASE+12(%esp),%ecx	# len
+	movl ARGBASE+4(%esp),%esi	# src
+	movl ARGBASE+8(%esp),%edi	# dst
+
+	testl $2, %edi			# Check alignment. 
+	jz 2f				# Jump if alignment is ok.
+	subl $2, %ecx			# Alignment uses up two bytes.
+	jae 1f				# Jump if we had at least two bytes.
+	addl $2, %ecx			# ecx was < 2.  Deal with it.
+	jmp 4f
+SRC(1:	movw (%esi), %bx	)
+	addl $2, %esi
+DST(	movw %bx, (%edi)	)
+	addl $2, %edi
+	addw %bx, %ax	
+	adcl $0, %eax
+2:
+	movl %ecx, FP(%esp)
+	shrl $5, %ecx
+	jz 2f
+	testl %esi, %esi
+SRC(1:	movl (%esi), %ebx	)
+SRC(	movl 4(%esi), %edx	)
+	adcl %ebx, %eax
+DST(	movl %ebx, (%edi)	)
+	adcl %edx, %eax
+DST(	movl %edx, 4(%edi)	)
+
+SRC(	movl 8(%esi), %ebx	)
+SRC(	movl 12(%esi), %edx	)
+	adcl %ebx, %eax
+DST(	movl %ebx, 8(%edi)	)
+	adcl %edx, %eax
+DST(	movl %edx, 12(%edi)	)
+
+SRC(	movl 16(%esi), %ebx 	)
+SRC(	movl 20(%esi), %edx	)
+	adcl %ebx, %eax
+DST(	movl %ebx, 16(%edi)	)
+	adcl %edx, %eax
+DST(	movl %edx, 20(%edi)	)
+
+SRC(	movl 24(%esi), %ebx	)
+SRC(	movl 28(%esi), %edx	)
+	adcl %ebx, %eax
+DST(	movl %ebx, 24(%edi)	)
+	adcl %edx, %eax
+DST(	movl %edx, 28(%edi)	)
+
+	lea 32(%esi), %esi
+	lea 32(%edi), %edi
+	dec %ecx
+	jne 1b
+	adcl $0, %eax
+2:	movl FP(%esp), %edx
+	movl %edx, %ecx
+	andl $0x1c, %edx
+	je 4f
+	shrl $2, %edx			# This clears CF
+SRC(3:	movl (%esi), %ebx	)
+	adcl %ebx, %eax
+DST(	movl %ebx, (%edi)	)
+	lea 4(%esi), %esi
+	lea 4(%edi), %edi
+	dec %edx
+	jne 3b
+	adcl $0, %eax
+4:	andl $3, %ecx
+	jz 7f
+	cmpl $2, %ecx
+	jb 5f
+SRC(	movw (%esi), %cx	)
+	leal 2(%esi), %esi
+DST(	movw %cx, (%edi)	)
+	leal 2(%edi), %edi
+	je 6f
+	shll $16,%ecx
+SRC(5:	movb (%esi), %cl	)
+DST(	movb %cl, (%edi)	)
+6:	addl %ecx, %eax
+	adcl $0, %eax
+7:
+5000:
+
+# Exception handler:
+.section .fixup, "ax"							
+
+6001:
+	movl ARGBASE+20(%esp), %ebx	# src_err_ptr
+	movl $-EFAULT, (%ebx)
+
+	# zero the complete destination - computing the rest
+	# is too much work 
+	movl ARGBASE+8(%esp), %edi	# dst
+	movl ARGBASE+12(%esp), %ecx	# len
+	xorl %eax,%eax
+	rep ; stosb
+
+	jmp 5000b
+
+6002:
+	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
+	movl $-EFAULT,(%ebx)
+	jmp 5000b
+
+.previous
+
+	popl %ebx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE ebx
+	popl %esi
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE esi
+	popl %edi
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE edi
+	popl %ecx			# equivalent to addl $4,%esp
+	CFI_ADJUST_CFA_OFFSET -4
+	ret	
+	CFI_ENDPROC
+ENDPROC(csum_partial_copy_generic)
+
+#else
+
+/* Version for PentiumII/PPro */
+
+#define ROUND1(x) \
+	SRC(movl x(%esi), %ebx	)	;	\
+	addl %ebx, %eax			;	\
+	DST(movl %ebx, x(%edi)	)	; 
+
+#define ROUND(x) \
+	SRC(movl x(%esi), %ebx	)	;	\
+	adcl %ebx, %eax			;	\
+	DST(movl %ebx, x(%edi)	)	;
+
+#define ARGBASE 12
+		
+ENTRY(csum_partial_copy_generic)
+	CFI_STARTPROC
+	pushl %ebx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ebx, 0
+	pushl %edi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edi, 0
+	pushl %esi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET esi, 0
+	movl ARGBASE+4(%esp),%esi	#src
+	movl ARGBASE+8(%esp),%edi	#dst	
+	movl ARGBASE+12(%esp),%ecx	#len
+	movl ARGBASE+16(%esp),%eax	#sum
+#	movl %ecx, %edx  
+	movl %ecx, %ebx  
+	movl %esi, %edx
+	shrl $6, %ecx     
+	andl $0x3c, %ebx  
+	negl %ebx
+	subl %ebx, %esi  
+	subl %ebx, %edi  
+	lea  -1(%esi),%edx
+	andl $-32,%edx
+	lea 3f(%ebx,%ebx), %ebx
+	testl %esi, %esi 
+	jmp *%ebx
+1:	addl $64,%esi
+	addl $64,%edi 
+	SRC(movb -32(%edx),%bl)	; SRC(movb (%edx),%bl)
+	ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)	
+	ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)	
+	ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)	
+	ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)	
+3:	adcl $0,%eax
+	addl $64, %edx
+	dec %ecx
+	jge 1b
+4:	movl ARGBASE+12(%esp),%edx	#len
+	andl $3, %edx
+	jz 7f
+	cmpl $2, %edx
+	jb 5f
+SRC(	movw (%esi), %dx         )
+	leal 2(%esi), %esi
+DST(	movw %dx, (%edi)         )
+	leal 2(%edi), %edi
+	je 6f
+	shll $16,%edx
+5:
+SRC(	movb (%esi), %dl         )
+DST(	movb %dl, (%edi)         )
+6:	addl %edx, %eax
+	adcl $0, %eax
+7:
+.section .fixup, "ax"
+6001:	movl	ARGBASE+20(%esp), %ebx	# src_err_ptr	
+	movl $-EFAULT, (%ebx)
+	# zero the complete destination (computing the rest is too much work)
+	movl ARGBASE+8(%esp),%edi	# dst
+	movl ARGBASE+12(%esp),%ecx	# len
+	xorl %eax,%eax
+	rep; stosb
+	jmp 7b
+6002:	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
+	movl $-EFAULT, (%ebx)
+	jmp  7b			
+.previous				
+
+	popl %esi
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE esi
+	popl %edi
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE edi
+	popl %ebx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE ebx
+	ret
+	CFI_ENDPROC
+ENDPROC(csum_partial_copy_generic)
+				
+#undef ROUND
+#undef ROUND1		
+		
+#endif
diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c
new file mode 100644
index 00000000000..f6edb11364d
--- /dev/null
+++ b/arch/x86/lib/delay_32.c
@@ -0,0 +1,103 @@
+/*
+ *	Precise Delay Loops for i386
+ *
+ *	Copyright (C) 1993 Linus Torvalds
+ *	Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ *	The __delay function must _NOT_ be inlined as its execution time
+ *	depends wildly on alignment on many x86 processors. The additional
+ *	jump magic is needed to get the timing stable on all the CPU's
+ *	we have to worry about.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+
+#include <asm/processor.h>
+#include <asm/delay.h>
+#include <asm/timer.h>
+
+#ifdef CONFIG_SMP
+# include <asm/smp.h>
+#endif
+
+/* simple loop based delay: */
+static void delay_loop(unsigned long loops)
+{
+	int d0;
+
+	__asm__ __volatile__(
+		"\tjmp 1f\n"
+		".align 16\n"
+		"1:\tjmp 2f\n"
+		".align 16\n"
+		"2:\tdecl %0\n\tjns 2b"
+		:"=&a" (d0)
+		:"0" (loops));
+}
+
+/* TSC based delay: */
+static void delay_tsc(unsigned long loops)
+{
+	unsigned long bclock, now;
+
+	rdtscl(bclock);
+	do {
+		rep_nop();
+		rdtscl(now);
+	} while ((now-bclock) < loops);
+}
+
+/*
+ * Since we calibrate only once at boot, this
+ * function should be set once at boot and not changed
+ */
+static void (*delay_fn)(unsigned long) = delay_loop;
+
+void use_tsc_delay(void)
+{
+	delay_fn = delay_tsc;
+}
+
+int read_current_timer(unsigned long *timer_val)
+{
+	if (delay_fn == delay_tsc) {
+		rdtscl(*timer_val);
+		return 0;
+	}
+	return -1;
+}
+
+void __delay(unsigned long loops)
+{
+	delay_fn(loops);
+}
+
+inline void __const_udelay(unsigned long xloops)
+{
+	int d0;
+
+	xloops *= 4;
+	__asm__("mull %0"
+		:"=d" (xloops), "=&a" (d0)
+		:"1" (xloops), "0"
+		(cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4)));
+
+	__delay(++xloops);
+}
+
+void __udelay(unsigned long usecs)
+{
+	__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
+}
+
+void __ndelay(unsigned long nsecs)
+{
+	__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
+}
+
+EXPORT_SYMBOL(__delay);
+EXPORT_SYMBOL(__const_udelay);
+EXPORT_SYMBOL(__udelay);
+EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/getuser_32.S b/arch/x86/lib/getuser_32.S
new file mode 100644
index 00000000000..6d84b53f12a
--- /dev/null
+++ b/arch/x86/lib/getuser_32.S
@@ -0,0 +1,78 @@
+/*
+ * __get_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/thread_info.h>
+
+
+/*
+ * __get_user_X
+ *
+ * Inputs:	%eax contains the address
+ *
+ * Outputs:	%eax is error code (0 or -EFAULT)
+ *		%edx contains zero-extended value
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+.text
+ENTRY(__get_user_1)
+	CFI_STARTPROC
+	GET_THREAD_INFO(%edx)
+	cmpl TI_addr_limit(%edx),%eax
+	jae bad_get_user
+1:	movzbl (%eax),%edx
+	xorl %eax,%eax
+	ret
+	CFI_ENDPROC
+ENDPROC(__get_user_1)
+
+ENTRY(__get_user_2)
+	CFI_STARTPROC
+	addl $1,%eax
+	jc bad_get_user
+	GET_THREAD_INFO(%edx)
+	cmpl TI_addr_limit(%edx),%eax
+	jae bad_get_user
+2:	movzwl -1(%eax),%edx
+	xorl %eax,%eax
+	ret
+	CFI_ENDPROC
+ENDPROC(__get_user_2)
+
+ENTRY(__get_user_4)
+	CFI_STARTPROC
+	addl $3,%eax
+	jc bad_get_user
+	GET_THREAD_INFO(%edx)
+	cmpl TI_addr_limit(%edx),%eax
+	jae bad_get_user
+3:	movl -3(%eax),%edx
+	xorl %eax,%eax
+	ret
+	CFI_ENDPROC
+ENDPROC(__get_user_4)
+
+bad_get_user:
+	CFI_STARTPROC
+	xorl %edx,%edx
+	movl $-14,%eax
+	ret
+	CFI_ENDPROC
+END(bad_get_user)
+
+.section __ex_table,"a"
+	.long 1b,bad_get_user
+	.long 2b,bad_get_user
+	.long 3b,bad_get_user
+.previous
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
new file mode 100644
index 00000000000..8ac51b82a63
--- /dev/null
+++ b/arch/x86/lib/memcpy_32.c
@@ -0,0 +1,43 @@
+#include <linux/string.h>
+#include <linux/module.h>
+
+#undef memcpy
+#undef memset
+
+void *memcpy(void *to, const void *from, size_t n)
+{
+#ifdef CONFIG_X86_USE_3DNOW
+	return __memcpy3d(to, from, n);
+#else
+	return __memcpy(to, from, n);
+#endif
+}
+EXPORT_SYMBOL(memcpy);
+
+void *memset(void *s, int c, size_t count)
+{
+	return __memset(s, c, count);
+}
+EXPORT_SYMBOL(memset);
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+	int d0, d1, d2;
+
+	if (dest < src) {
+		memcpy(dest,src,n);
+	} else {
+		__asm__ __volatile__(
+			"std\n\t"
+			"rep\n\t"
+			"movsb\n\t"
+			"cld"
+			: "=&c" (d0), "=&S" (d1), "=&D" (d2)
+			:"0" (n),
+			 "1" (n-1+(const char *)src),
+			 "2" (n-1+(char *)dest)
+			:"memory");
+	}
+	return dest;
+}
+EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
new file mode 100644
index 00000000000..28084d2e8dd
--- /dev/null
+++ b/arch/x86/lib/mmx_32.c
@@ -0,0 +1,403 @@
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/module.h>
+
+#include <asm/i387.h>
+
+
+/*
+ *	MMX 3DNow! library helper functions
+ *
+ *	To do:
+ *	We can use MMX just for prefetch in IRQ's. This may be a win. 
+ *		(reported so on K6-III)
+ *	We should use a better code neutral filler for the short jump
+ *		leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
+ *	We also want to clobber the filler register so we don't get any
+ *		register forwarding stalls on the filler. 
+ *
+ *	Add *user handling. Checksums are not a win with MMX on any CPU
+ *	tested so far for any MMX solution figured.
+ *
+ *	22/09/2000 - Arjan van de Ven 
+ *		Improved for non-egineering-sample Athlons 
+ *
+ */
+ 
+void *_mmx_memcpy(void *to, const void *from, size_t len)
+{
+	void *p;
+	int i;
+
+	if (unlikely(in_interrupt()))
+		return __memcpy(to, from, len);
+
+	p = to;
+	i = len >> 6; /* len/64 */
+
+	kernel_fpu_begin();
+
+	__asm__ __volatile__ (
+		"1: prefetch (%0)\n"		/* This set is 28 bytes */
+		"   prefetch 64(%0)\n"
+		"   prefetch 128(%0)\n"
+		"   prefetch 192(%0)\n"
+		"   prefetch 256(%0)\n"
+		"2:  \n"
+		".section .fixup, \"ax\"\n"
+		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
+		"   jmp 2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 4\n"
+		"	.long 1b, 3b\n"
+		".previous"
+		: : "r" (from) );
+		
+	
+	for(; i>5; i--)
+	{
+		__asm__ __volatile__ (
+		"1:  prefetch 320(%0)\n"
+		"2:  movq (%0), %%mm0\n"
+		"  movq 8(%0), %%mm1\n"
+		"  movq 16(%0), %%mm2\n"
+		"  movq 24(%0), %%mm3\n"
+		"  movq %%mm0, (%1)\n"
+		"  movq %%mm1, 8(%1)\n"
+		"  movq %%mm2, 16(%1)\n"
+		"  movq %%mm3, 24(%1)\n"
+		"  movq 32(%0), %%mm0\n"
+		"  movq 40(%0), %%mm1\n"
+		"  movq 48(%0), %%mm2\n"
+		"  movq 56(%0), %%mm3\n"
+		"  movq %%mm0, 32(%1)\n"
+		"  movq %%mm1, 40(%1)\n"
+		"  movq %%mm2, 48(%1)\n"
+		"  movq %%mm3, 56(%1)\n"
+		".section .fixup, \"ax\"\n"
+		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
+		"   jmp 2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 4\n"
+		"	.long 1b, 3b\n"
+		".previous"
+		: : "r" (from), "r" (to) : "memory");
+		from+=64;
+		to+=64;
+	}
+
+	for(; i>0; i--)
+	{
+		__asm__ __volatile__ (
+		"  movq (%0), %%mm0\n"
+		"  movq 8(%0), %%mm1\n"
+		"  movq 16(%0), %%mm2\n"
+		"  movq 24(%0), %%mm3\n"
+		"  movq %%mm0, (%1)\n"
+		"  movq %%mm1, 8(%1)\n"
+		"  movq %%mm2, 16(%1)\n"
+		"  movq %%mm3, 24(%1)\n"
+		"  movq 32(%0), %%mm0\n"
+		"  movq 40(%0), %%mm1\n"
+		"  movq 48(%0), %%mm2\n"
+		"  movq 56(%0), %%mm3\n"
+		"  movq %%mm0, 32(%1)\n"
+		"  movq %%mm1, 40(%1)\n"
+		"  movq %%mm2, 48(%1)\n"
+		"  movq %%mm3, 56(%1)\n"
+		: : "r" (from), "r" (to) : "memory");
+		from+=64;
+		to+=64;
+	}
+	/*
+	 *	Now do the tail of the block
+	 */
+	__memcpy(to, from, len&63);
+	kernel_fpu_end();
+	return p;
+}
+
+#ifdef CONFIG_MK7
+
+/*
+ *	The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
+ *	other MMX using processors do not.
+ */
+
+static void fast_clear_page(void *page)
+{
+	int i;
+
+	kernel_fpu_begin();
+	
+	__asm__ __volatile__ (
+		"  pxor %%mm0, %%mm0\n" : :
+	);
+
+	for(i=0;i<4096/64;i++)
+	{
+		__asm__ __volatile__ (
+		"  movntq %%mm0, (%0)\n"
+		"  movntq %%mm0, 8(%0)\n"
+		"  movntq %%mm0, 16(%0)\n"
+		"  movntq %%mm0, 24(%0)\n"
+		"  movntq %%mm0, 32(%0)\n"
+		"  movntq %%mm0, 40(%0)\n"
+		"  movntq %%mm0, 48(%0)\n"
+		"  movntq %%mm0, 56(%0)\n"
+		: : "r" (page) : "memory");
+		page+=64;
+	}
+	/* since movntq is weakly-ordered, a "sfence" is needed to become
+	 * ordered again.
+	 */
+	__asm__ __volatile__ (
+		"  sfence \n" : :
+	);
+	kernel_fpu_end();
+}
+
+static void fast_copy_page(void *to, void *from)
+{
+	int i;
+
+	kernel_fpu_begin();
+
+	/* maybe the prefetch stuff can go before the expensive fnsave...
+	 * but that is for later. -AV
+	 */
+	__asm__ __volatile__ (
+		"1: prefetch (%0)\n"
+		"   prefetch 64(%0)\n"
+		"   prefetch 128(%0)\n"
+		"   prefetch 192(%0)\n"
+		"   prefetch 256(%0)\n"
+		"2:  \n"
+		".section .fixup, \"ax\"\n"
+		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
+		"   jmp 2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 4\n"
+		"	.long 1b, 3b\n"
+		".previous"
+		: : "r" (from) );
+
+	for(i=0; i<(4096-320)/64; i++)
+	{
+		__asm__ __volatile__ (
+		"1: prefetch 320(%0)\n"
+		"2: movq (%0), %%mm0\n"
+		"   movntq %%mm0, (%1)\n"
+		"   movq 8(%0), %%mm1\n"
+		"   movntq %%mm1, 8(%1)\n"
+		"   movq 16(%0), %%mm2\n"
+		"   movntq %%mm2, 16(%1)\n"
+		"   movq 24(%0), %%mm3\n"
+		"   movntq %%mm3, 24(%1)\n"
+		"   movq 32(%0), %%mm4\n"
+		"   movntq %%mm4, 32(%1)\n"
+		"   movq 40(%0), %%mm5\n"
+		"   movntq %%mm5, 40(%1)\n"
+		"   movq 48(%0), %%mm6\n"
+		"   movntq %%mm6, 48(%1)\n"
+		"   movq 56(%0), %%mm7\n"
+		"   movntq %%mm7, 56(%1)\n"
+		".section .fixup, \"ax\"\n"
+		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
+		"   jmp 2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 4\n"
+		"	.long 1b, 3b\n"
+		".previous"
+		: : "r" (from), "r" (to) : "memory");
+		from+=64;
+		to+=64;
+	}
+	for(i=(4096-320)/64; i<4096/64; i++)
+	{
+		__asm__ __volatile__ (
+		"2: movq (%0), %%mm0\n"
+		"   movntq %%mm0, (%1)\n"
+		"   movq 8(%0), %%mm1\n"
+		"   movntq %%mm1, 8(%1)\n"
+		"   movq 16(%0), %%mm2\n"
+		"   movntq %%mm2, 16(%1)\n"
+		"   movq 24(%0), %%mm3\n"
+		"   movntq %%mm3, 24(%1)\n"
+		"   movq 32(%0), %%mm4\n"
+		"   movntq %%mm4, 32(%1)\n"
+		"   movq 40(%0), %%mm5\n"
+		"   movntq %%mm5, 40(%1)\n"
+		"   movq 48(%0), %%mm6\n"
+		"   movntq %%mm6, 48(%1)\n"
+		"   movq 56(%0), %%mm7\n"
+		"   movntq %%mm7, 56(%1)\n"
+		: : "r" (from), "r" (to) : "memory");
+		from+=64;
+		to+=64;
+	}
+	/* since movntq is weakly-ordered, a "sfence" is needed to become
+	 * ordered again.
+	 */
+	__asm__ __volatile__ (
+		"  sfence \n" : :
+	);
+	kernel_fpu_end();
+}
+
+#else
+
+/*
+ *	Generic MMX implementation without K7 specific streaming
+ */
+ 
+static void fast_clear_page(void *page)
+{
+	int i;
+	
+	kernel_fpu_begin();
+	
+	__asm__ __volatile__ (
+		"  pxor %%mm0, %%mm0\n" : :
+	);
+
+	for(i=0;i<4096/128;i++)
+	{
+		__asm__ __volatile__ (
+		"  movq %%mm0, (%0)\n"
+		"  movq %%mm0, 8(%0)\n"
+		"  movq %%mm0, 16(%0)\n"
+		"  movq %%mm0, 24(%0)\n"
+		"  movq %%mm0, 32(%0)\n"
+		"  movq %%mm0, 40(%0)\n"
+		"  movq %%mm0, 48(%0)\n"
+		"  movq %%mm0, 56(%0)\n"
+		"  movq %%mm0, 64(%0)\n"
+		"  movq %%mm0, 72(%0)\n"
+		"  movq %%mm0, 80(%0)\n"
+		"  movq %%mm0, 88(%0)\n"
+		"  movq %%mm0, 96(%0)\n"
+		"  movq %%mm0, 104(%0)\n"
+		"  movq %%mm0, 112(%0)\n"
+		"  movq %%mm0, 120(%0)\n"
+		: : "r" (page) : "memory");
+		page+=128;
+	}
+
+	kernel_fpu_end();
+}
+
+static void fast_copy_page(void *to, void *from)
author	Thomas Gleixner <tglx@linutronix.de>	2007-10-11 11:16:33 +0200
committer	Thomas Gleixner <tglx@linutronix.de>	2007-10-11 11:16:33 +0200
commit	44f0257fc316ff4b33aa3438dd8d891b7d6d72b9 (patch)
tree	c1a9a571db37d631489f18e1dfe5554874b19027 /arch/x86/lib
parent	da957e111bb0c189a4a3bf8a00caaecb59ed94ca (diff)