aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2007-10-11 11:17:08 +0200
committerThomas Gleixner <tglx@linutronix.de>2007-10-11 11:17:08 +0200
commit185f3d38900f750a4566f87cde6a178f3595a115 (patch)
treed463f6da1af452b1bbdf476828ea88427087f255 /arch/x86
parent51b2833060f26258ea2da091c7b9c6a358ac9dd2 (diff)
x86_64: move lib
Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/lib/Makefile2
-rw-r--r--arch/x86/lib/Makefile_6413
-rw-r--r--arch/x86/lib/bitops_64.c175
-rw-r--r--arch/x86/lib/bitstr_64.c28
-rw-r--r--arch/x86/lib/clear_page_64.S59
-rw-r--r--arch/x86/lib/copy_page_64.S119
-rw-r--r--arch/x86/lib/copy_user_64.S354
-rw-r--r--arch/x86/lib/copy_user_nocache_64.S217
-rw-r--r--arch/x86/lib/csum-copy_64.S249
-rw-r--r--arch/x86/lib/csum-partial_64.c150
-rw-r--r--arch/x86/lib/csum-wrappers_64.c135
-rw-r--r--arch/x86/lib/delay_64.c57
-rw-r--r--arch/x86/lib/getuser_64.S109
-rw-r--r--arch/x86/lib/io_64.c23
-rw-r--r--arch/x86/lib/iomap_copy_64.S30
-rw-r--r--arch/x86/lib/memcpy_64.S131
-rw-r--r--arch/x86/lib/memmove_64.c21
-rw-r--r--arch/x86/lib/memset_64.S133
-rw-r--r--arch/x86/lib/putuser_64.S106
-rw-r--r--arch/x86/lib/rwlock_64.S38
-rw-r--r--arch/x86/lib/thunk_64.S67
-rw-r--r--arch/x86/lib/usercopy_64.c166
22 files changed, 2381 insertions, 1 deletions
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 2d7d724a2a6..329da276c6f 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -1,5 +1,5 @@
ifeq ($(CONFIG_X86_32),y)
include ${srctree}/arch/x86/lib/Makefile_32
else
-include ${srctree}/arch/x86_64/lib/Makefile_64
+include ${srctree}/arch/x86/lib/Makefile_64
endif
diff --git a/arch/x86/lib/Makefile_64 b/arch/x86/lib/Makefile_64
new file mode 100644
index 00000000000..bbabad3c933
--- /dev/null
+++ b/arch/x86/lib/Makefile_64
@@ -0,0 +1,13 @@
+#
+# Makefile for x86_64-specific library files.
+#
+
+CFLAGS_csum-partial_64.o := -funroll-loops
+
+obj-y := io_64.o iomap_copy_64.o
+obj-$(CONFIG_SMP) += msr-on-cpu.o
+
+lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
+ usercopy_64.o getuser_64.o putuser_64.o \
+ thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
+lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c
new file mode 100644
index 00000000000..95b6d9639fb
--- /dev/null
+++ b/arch/x86/lib/bitops_64.c
@@ -0,0 +1,175 @@
+#include <linux/bitops.h>
+
+#undef find_first_zero_bit
+#undef find_next_zero_bit
+#undef find_first_bit
+#undef find_next_bit
+
+static inline long
+__find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+ long d0, d1, d2;
+ long res;
+
+ /*
+ * We must test the size in words, not in bits, because
+ * otherwise incoming sizes in the range -63..-1 will not run
+ * any scasq instructions, and then the flags used by the je
+ * instruction will have whatever random value was in place
+ * before. Nobody should call us like that, but
+ * find_next_zero_bit() does when offset and size are at the
+ * same word and it fails to find a zero itself.
+ */
+ size += 63;
+ size >>= 6;
+ if (!size)
+ return 0;
+ asm volatile(
+ " repe; scasq\n"
+ " je 1f\n"
+ " xorq -8(%%rdi),%%rax\n"
+ " subq $8,%%rdi\n"
+ " bsfq %%rax,%%rdx\n"
+ "1: subq %[addr],%%rdi\n"
+ " shlq $3,%%rdi\n"
+ " addq %%rdi,%%rdx"
+ :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
+ :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
+ [addr] "S" (addr) : "memory");
+ /*
+ * Any register would do for [addr] above, but GCC tends to
+ * prefer rbx over rsi, even though rsi is readily available
+ * and doesn't have to be saved.
+ */
+ return res;
+}
+
+/**
+ * find_first_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first zero bit, not the number of the byte
+ * containing a bit.
+ */
+long find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+ return __find_first_zero_bit (addr, size);
+}
+
+/**
+ * find_next_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+long find_next_zero_bit (const unsigned long * addr, long size, long offset)
+{
+ const unsigned long * p = addr + (offset >> 6);
+ unsigned long set = 0;
+ unsigned long res, bit = offset&63;
+
+ if (bit) {
+ /*
+ * Look for zero in first word
+ */
+ asm("bsfq %1,%0\n\t"
+ "cmoveq %2,%0"
+ : "=r" (set)
+ : "r" (~(*p >> bit)), "r"(64L));
+ if (set < (64 - bit))
+ return set + offset;
+ set = 64 - bit;
+ p++;
+ }
+ /*
+ * No zero yet, search remaining full words for a zero
+ */
+ res = __find_first_zero_bit (p, size - 64 * (p - addr));
+
+ return (offset + set + res);
+}
+
+static inline long
+__find_first_bit(const unsigned long * addr, unsigned long size)
+{
+ long d0, d1;
+ long res;
+
+ /*
+ * We must test the size in words, not in bits, because
+ * otherwise incoming sizes in the range -63..-1 will not run
+ * any scasq instructions, and then the flags used by the jz
+ * instruction will have whatever random value was in place
+ * before. Nobody should call us like that, but
+ * find_next_bit() does when offset and size are at the same
+ * word and it fails to find a one itself.
+ */
+ size += 63;
+ size >>= 6;
+ if (!size)
+ return 0;
+ asm volatile(
+ " repe; scasq\n"
+ " jz 1f\n"
+ " subq $8,%%rdi\n"
+ " bsfq (%%rdi),%%rax\n"
+ "1: subq %[addr],%%rdi\n"
+ " shlq $3,%%rdi\n"
+ " addq %%rdi,%%rax"
+ :"=a" (res), "=&c" (d0), "=&D" (d1)
+ :"0" (0ULL), "1" (size), "2" (addr),
+ [addr] "r" (addr) : "memory");
+ return res;
+}
+
+/**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first set bit, not the number of the byte
+ * containing a bit.
+ */
+long find_first_bit(const unsigned long * addr, unsigned long size)
+{
+ return __find_first_bit(addr,size);
+}
+
+/**
+ * find_next_bit - find the first set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+long find_next_bit(const unsigned long * addr, long size, long offset)
+{
+ const unsigned long * p = addr + (offset >> 6);
+ unsigned long set = 0, bit = offset & 63, res;
+
+ if (bit) {
+ /*
+ * Look for nonzero in the first 64 bits:
+ */
+ asm("bsfq %1,%0\n\t"
+ "cmoveq %2,%0\n\t"
+ : "=r" (set)
+ : "r" (*p >> bit), "r" (64L));
+ if (set < (64 - bit))
+ return set + offset;
+ set = 64 - bit;
+ p++;
+ }
+ /*
+ * No set bit yet, search remaining full words for a bit
+ */
+ res = __find_first_bit (p, size - 64 * (p - addr));
+ return (offset + set + res);
+}
+
+#include <linux/module.h>
+
+EXPORT_SYMBOL(find_next_bit);
+EXPORT_SYMBOL(find_first_bit);
+EXPORT_SYMBOL(find_first_zero_bit);
+EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/bitstr_64.c b/arch/x86/lib/bitstr_64.c
new file mode 100644
index 00000000000..24676609a6a
--- /dev/null
+++ b/arch/x86/lib/bitstr_64.c
@@ -0,0 +1,28 @@
+#include <linux/module.h>
+#include <linux/bitops.h>
+
+/* Find string of zero bits in a bitmap */
+unsigned long
+find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
+{
+ unsigned long n, end, i;
+
+ again:
+ n = find_next_zero_bit(bitmap, nbits, start);
+ if (n == -1)
+ return -1;
+
+ /* could test bitsliced, but it's hardly worth it */
+ end = n+len;
+ if (end >= nbits)
+ return -1;
+ for (i = n+1; i < end; i++) {
+ if (test_bit(i, bitmap)) {
+ start = i+1;
+ goto again;
+ }
+ }
+ return n;
+}
+
+EXPORT_SYMBOL(find_next_zero_string);
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
new file mode 100644
index 00000000000..9a10a78bb4a
--- /dev/null
+++ b/arch/x86/lib/clear_page_64.S
@@ -0,0 +1,59 @@
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * Zero a page.
+ * rdi page
+ */
+ ALIGN
+clear_page_c:
+ CFI_STARTPROC
+ movl $4096/8,%ecx
+ xorl %eax,%eax
+ rep stosq
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_page)
+
+ENTRY(clear_page)
+ CFI_STARTPROC
+ xorl %eax,%eax
+ movl $4096/64,%ecx
+ .p2align 4
+.Lloop:
+ decl %ecx
+#define PUT(x) movq %rax,x*8(%rdi)
+ movq %rax,(%rdi)
+ PUT(1)
+ PUT(2)
+ PUT(3)
+ PUT(4)
+ PUT(5)
+ PUT(6)
+ PUT(7)
+ leaq 64(%rdi),%rdi
+ jnz .Lloop
+ nop
+ ret
+ CFI_ENDPROC
+.Lclear_page_end:
+ENDPROC(clear_page)
+
+ /* Some CPUs run faster using the string instructions.
+ It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+ .section .altinstr_replacement,"ax"
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
+2:
+ .previous
+ .section .altinstructions,"a"
+ .align 8
+ .quad clear_page
+ .quad 1b
+ .byte X86_FEATURE_REP_GOOD
+ .byte .Lclear_page_end - clear_page
+ .byte 2b - 1b
+ .previous
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
new file mode 100644
index 00000000000..727a5d46d2f
--- /dev/null
+++ b/arch/x86/lib/copy_page_64.S
@@ -0,0 +1,119 @@
+/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+ ALIGN
+copy_page_c:
+ CFI_STARTPROC
+ movl $4096/8,%ecx
+ rep movsq
+ ret
+ CFI_ENDPROC
+ENDPROC(copy_page_c)
+
+/* Don't use streaming store because it's better when the target
+ ends up in cache. */
+
+/* Could vary the prefetch distance based on SMP/UP */
+
+ENTRY(copy_page)
+ CFI_STARTPROC
+ subq $3*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 3*8
+ movq %rbx,(%rsp)
+ CFI_REL_OFFSET rbx, 0
+ movq %r12,1*8(%rsp)
+ CFI_REL_OFFSET r12, 1*8
+ movq %r13,2*8(%rsp)
+ CFI_REL_OFFSET r13, 2*8
+
+ movl $(4096/64)-5,%ecx
+ .p2align 4
+.Loop64:
+ dec %rcx
+
+ movq (%rsi), %rax
+ movq 8 (%rsi), %rbx
+ movq 16 (%rsi), %rdx
+ movq 24 (%rsi), %r8
+ movq 32 (%rsi), %r9
+ movq 40 (%rsi), %r10
+ movq 48 (%rsi), %r11
+ movq 56 (%rsi), %r12
+
+ prefetcht0 5*64(%rsi)
+
+ movq %rax, (%rdi)
+ movq %rbx, 8 (%rdi)
+ movq %rdx, 16 (%rdi)
+ movq %r8, 24 (%rdi)
+ movq %r9, 32 (%rdi)
+ movq %r10, 40 (%rdi)
+ movq %r11, 48 (%rdi)
+ movq %r12, 56 (%rdi)
+
+ leaq 64 (%rsi), %rsi
+ leaq 64 (%rdi), %rdi
+
+ jnz .Loop64
+
+ movl $5,%ecx
+ .p2align 4
+.Loop2:
+ decl %ecx
+
+ movq (%rsi), %rax
+ movq 8 (%rsi), %rbx
+ movq 16 (%rsi), %rdx
+ movq 24 (%rsi), %r8
+ movq 32 (%rsi), %r9
+ movq 40 (%rsi), %r10
+ movq 48 (%rsi), %r11
+ movq 56 (%rsi), %r12
+
+ movq %rax, (%rdi)
+ movq %rbx, 8 (%rdi)
+ movq %rdx, 16 (%rdi)
+ movq %r8, 24 (%rdi)
+ movq %r9, 32 (%rdi)
+ movq %r10, 40 (%rdi)
+ movq %r11, 48 (%rdi)
+ movq %r12, 56 (%rdi)
+
+ leaq 64(%rdi),%rdi
+ leaq 64(%rsi),%rsi
+
+ jnz .Loop2
+
+ movq (%rsp),%rbx
+ CFI_RESTORE rbx
+ movq 1*8(%rsp),%r12
+ CFI_RESTORE r12
+ movq 2*8(%rsp),%r13
+ CFI_RESTORE r13
+ addq $3*8,%rsp
+ CFI_ADJUST_CFA_OFFSET -3*8
+ ret
+.Lcopy_page_end:
+ CFI_ENDPROC
+ENDPROC(copy_page)
+
+ /* Some CPUs run faster using the string copy instructions.
+ It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+ .section .altinstr_replacement,"ax"
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
+2:
+ .previous
+ .section .altinstructions,"a"
+ .align 8
+ .quad copy_page
+ .quad 1b
+ .byte X86_FEATURE_REP_GOOD
+ .byte .Lcopy_page_end - copy_page
+ .byte 2b - 1b
+ .previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
new file mode 100644
index 00000000000..70bebd31040
--- /dev/null
+++ b/arch/x86/lib/copy_user_64.S
@@ -0,0 +1,354 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+ .macro ALTERNATIVE_JUMP feature,orig,alt
+0:
+ .byte 0xe9 /* 32bit jump */
+ .long \orig-1f /* by default jump to orig */
+1:
+ .section .altinstr_replacement,"ax"
+2: .byte 0xe9 /* near jump with 32bit immediate */
+ .long \alt-1b /* offset */ /* or alternatively to alt */
+ .previous
+ .section .altinstructions,"a"
+ .align 8
+ .quad 0b
+ .quad 2b
+ .byte \feature /* when feature is set */
+ .byte 5
+ .byte 5
+ .previous
+ .endm
+
+/* Standard copy_to_user with segment limit checking */
+ENTRY(copy_to_user)
+ CFI_STARTPROC
+ GET_THREAD_INFO(%rax)
+ movq %rdi,%rcx
+ addq %rdx,%rcx
+ jc bad_to_user
+ cmpq threadinfo_addr_limit(%rax),%rcx
+ jae bad_to_user
+ xorl %eax,%eax /* clear zero flag */
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+
+ENTRY(copy_user_generic)
+ CFI_STARTPROC
+ movl $1,%ecx /* set zero flag */
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+
+ENTRY(__copy_from_user_inatomic)
+ CFI_STARTPROC
+ xorl %ecx,%ecx /* clear zero flag */
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+
+/* Standard copy_from_user with segment limit checking */
+ENTRY(copy_from_user)
+ CFI_STARTPROC
+ GET_THREAD_INFO(%rax)
+ movq %rsi,%rcx
+ addq %rdx,%rcx
+ jc bad_from_user
+ cmpq threadinfo_addr_limit(%rax),%rcx
+ jae bad_from_user
+ movl $1,%ecx /* set zero flag */
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+ENDPROC(copy_from_user)
+
+ .section .fixup,"ax"
+ /* must zero dest */
+bad_from_user:
+ CFI_STARTPROC
+ movl %edx,%ecx
+ xorl %eax,%eax
+ rep
+ stosb
+bad_to_user:
+ movl %edx,%eax
+ ret
+ CFI_ENDPROC
+END(bad_from_user)
+ .previous
+
+
+/*
+ * copy_user_generic_unrolled - memory copy with exception handling.
+ * This version is for CPUs like P4 that don't have efficient micro code for rep movsq
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ * ecx zero flag -- if true zero destination on error
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_generic_unrolled)
+ CFI_STARTPROC
+ pushq %rbx
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbx, 0
+ pushq %rcx
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rcx, 0
+ xorl %eax,%eax /*zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+ /* check for bad alignment of destination */
+ movl %edi,%ecx
+ andl $7,%ecx
+ jnz .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+ movq %rdx,%rcx
+
+ movl $64,%ebx
+ shrq $6,%rdx
+ decq %rdx
+ js .Lhandle_tail
+
+ .p2align 4
+.Lloop:
+.Ls1: movq (%rsi),%r11
+.Ls2: movq 1*8(%rsi),%r8
+.Ls3: movq 2*8(%rsi),%r9
+.Ls4: movq 3*8(%rsi),%r10
+.Ld1: movq %r11,(%rdi)
+.Ld2: movq %r8,1*8(%rdi)
+.Ld3: movq %r9,2*8(%rdi)
+.Ld4: movq %r10,3*8(%rdi)
+
+.Ls5: movq 4*8(%rsi),%r11
+.Ls6: movq 5*8(%rsi),%r8
+.Ls7: movq 6*8(%rsi),%r9
+.Ls8: movq 7*8(%rsi),%r10
+.Ld5: movq %r11,4*8(%rdi)
+.Ld6: movq %r8,5*8(%rdi)
+.Ld7: movq %r9,6*8(%rdi)
+.Ld8: movq %r10,7*8(%rdi)
+
+ decq %rdx
+
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+
+ jns .Lloop
+
+ .p2align 4
+.Lhandle_tail:
+ movl %ecx,%edx
+ andl $63,%ecx
+ shrl $3,%ecx
+ jz .Lhandle_7
+ movl $8,%ebx
+ .p2align 4
+.Lloop_8:
+.Ls9: movq (%rsi),%r8
+.Ld9: movq %r8,(%rdi)
+ decl %ecx
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %edx,%ecx
+ andl $7,%ecx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+.Ls10: movb (%rsi),%bl
+.Ld10: movb %bl,(%rdi)
+ incq %rdi
+ incq %rsi
+ decl %ecx
+ jnz .Lloop_1
+
+ CFI_REMEMBER_STATE
+.Lende:
+ popq %rcx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rcx
+ popq %rbx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rbx
+ ret
+ CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+ /* align destination */
+ .p2align 4
+.Lbad_alignment:
+ movl $8,%r9d
+ subl %ecx,%r9d
+ movl %r9d,%ecx
+ cmpq %r9,%rdx
+ jz .Lhandle_7
+ js .Lhandle_7
+.Lalign_1:
+.Ls11: movb (%rsi),%bl
+.Ld11: movb %bl,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .Lalign_1
+ subq %r9,%rdx
+ jmp .Lafter_bad_alignment
+#endif
+
+ /* table sorted by exception address */
+ .section __ex_table,"a"
+ .align 8
+ .quad .Ls1,.Ls1e
+ .quad .Ls2,.Ls2e
+ .quad .Ls3,.Ls3e
+ .quad .Ls4,.Ls4e
+ .quad .Ld1,.Ls1e
+ .quad .Ld2,.Ls2e
+ .quad .Ld3,.Ls3e
+ .quad .Ld4,.Ls4e
+ .quad .Ls5,.Ls5e
+ .quad .Ls6,.Ls6e
+ .quad .Ls7,.Ls7e
+ .quad .Ls8,.Ls8e
+ .quad .Ld5,.Ls5e
+ .quad .Ld6,.Ls6e
+ .quad .Ld7,.Ls7e
+ .quad .Ld8,.Ls8e
+ .quad .Ls9,.Le_quad
+ .quad .Ld9,.Le_quad
+ .quad .Ls10,.Le_byte
+ .quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+ .quad .Ls11,.Lzero_rest
+ .quad .Ld11,.Lzero_rest
+#endif
+ .quad .Le5,.Le_zero
+ .previous
+
+ /* compute 64-offset for main loop. 8 bytes accuracy with error on the
+ pessimistic side. this is gross. it would be better to fix the
+ interface. */
+ /* eax: zero, ebx: 64 */
+.Ls1e: addl $8,%eax
+.Ls2e: addl $8,%eax
+.Ls3e: addl $8,%eax
+.Ls4e: addl $8,%eax
+.Ls5e: addl $8,%eax
+.Ls6e: addl $8,%eax
+.Ls7e: addl $8,%eax
+.Ls8e: addl $8,%eax
+ addq %rbx,%rdi /* +64 */
+ subq %rax,%rdi /* correct destination with computed offset */
+
+ shlq $6,%rdx /* loop counter * 64 (stride length) */
+ addq %rax,%rdx /* add offset to loopcnt */
+ andl $63,%ecx /* remaining bytes */
+ addq %rcx,%rdx /* add them */
+ jmp .Lzero_rest
+
+ /* exception on quad word loop in tail handling */
+ /* ecx: loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+ shll $3,%ecx
+ andl $7,%edx
+ addl %ecx,%edx
+ /* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+ cmpl $0,(%rsp)
+ jz .Le_zero
+ movq %rdx,%rcx
+.Le_byte:
+ xorl %eax,%eax
+.Le5: rep
+ stosb
+ /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+ movq %rdx,%rax
+ jmp .Lende
+ CFI_ENDPROC
+ENDPROC(copy_user_generic)
+
+
+ /* Some CPUs run faster using the string copy instructions.
+ This is also a lot simpler. Use them when possible.
+ Patch in jmps to this code instead of copying it fully
+ to avoid unwanted aliasing in the exception tables. */
+
+ /* rdi destination
+ * rsi source
+ * rdx count
+ * ecx zero flag
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successfull.
+ *
+ * Only 4GB of copy is supported. This shouldn't be a problem
+ * because the kernel normally only writes from/to page sized chunks
+ * even if user space passed a longer buffer.
+ * And more would be dangerous because both Intel and AMD have
+ * errata with rep movsq > 4GB. If someone feels the need to fix
+ * this please consider this.
+ */
+ENTRY(copy_user_generic_string)
+ CFI_STARTPROC
+ movl %ecx,%r8d /* save zero flag */
+ movl %edx,%ecx
+ shrl $3,%ecx
+ andl $7,%edx
+ jz 10f
+1: rep
+ movsq
+ movl %edx,%ecx
+2: rep
+ movsb
+9: movl %ecx,%eax
+ ret
+
+ /* multiple of 8 byte */
+10: rep
+ movsq
+ xor %eax,%eax
+ ret
+
+ /* exception handling */
+3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */
+ jmp 6f
+5: movl %ecx,%eax /* exception on byte loop */
+ /* eax: left over bytes */
+6: testl %r8d,%r8d /* zero flag set? */
+ jz 7f
+ movl %eax,%ecx /* initialize x86 loop counter */
+ push %rax
+ xorl %eax,%eax
+8: rep
+ stosb /* zero the rest */
+11: pop %rax
+7: ret
+ CFI_ENDPROC
+END(copy_user_generic_c)
+
+ .section __ex_table,"a"
+ .quad 1b,3b
+ .quad 2b,5b
+ .quad 8b,11b
+ .quad 10b,3b
+ .previous
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
new file mode 100644
index 00000000000..4620efb12f1
--- /dev/null
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -0,0 +1,217 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ * rcx zero flag when 1 zero on exception
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(__copy_user_nocache)
+ CFI_STARTPROC
+ pushq %rbx
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbx, 0
+ pushq %rcx /* save zero flag */
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rcx, 0
+
+ xorl %eax,%eax /* zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+ /* check for bad alignment of destination */
+ movl %edi,%ecx
+ andl $7,%ecx
+ jnz .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+ movq %rdx,%rcx
+
+ movl $64,%ebx
+ shrq $6,%rdx
+ decq %rdx
+ js .Lhandle_tail
+
+ .p2align 4
+.Lloop:
+.Ls1: movq (%rsi),%r11
+.Ls2: movq 1*8(%rsi),%r8
+.Ls3: movq 2*8(%rsi),%r9
+.Ls4: movq 3*8(%rsi),%r10
+.Ld1: movnti %r11,(%rdi)
+.Ld2: movnti %r8,1*8(%rdi)
+.Ld3: movnti %r9,2*8(%rdi)
+.Ld4: movnti %r10,3*8(%rdi)
+
+.Ls5: movq 4*8(%rsi),%r11
+.Ls6: movq 5*8(%rsi),%r8
+.Ls7: movq 6*8(%rsi),%r9
+.Ls8: movq 7*8(%rsi),%r10
+.Ld5: movnti %r11,4*8(%rdi)
+.Ld6: movnti %r8,5*8(%rdi)
+.Ld7: movnti %r9,6*8(%rdi)
+.Ld8: movnti %r10,7*8(%rdi)
+
+ dec %rdx
+
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+
+ jns .Lloop
+
+ .p2align 4
+.Lhandle_tail:
+ movl %ecx,%edx
+ andl $63,%ecx
+ shrl $3,%ecx
+ jz .Lhandle_7
+ movl $8,%ebx
+ .p2align 4
+.Lloop_8:
+.Ls9: movq (%rsi),%r8
+.Ld9: movnti %r8,(%rdi)
+ decl %ecx
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %edx,%ecx
+ andl $7,%ecx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+.Ls10: movb (%rsi),%bl
+.Ld10: movb %bl,(%rdi)
+ incq %rdi
+ incq %rsi
+ decl %ecx
+ jnz .Lloop_1
+
+ CFI_REMEMBER_STATE
+.Lende:
+ popq %rcx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE %rcx
+ popq %rbx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rbx
+ ret
+ CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+ /* align destination */
+ .p2align 4
+.Lbad_alignment:
+ movl $8,%r9d
+ subl %ecx,%r9d
+ movl %r9d,%ecx
+ cmpq %r9,%rdx
+ jz .Lhandle_7
+ js .Lhandle_7
+.Lalign_1:
+.Ls11: movb (%rsi),%bl
+.Ld11: movb %bl,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .Lalign_1
+ subq %r9,%rdx
+ jmp .Lafter_bad_alignment
+#endif
+
+ /* table sorted by exception address */
+ .section __ex_table,"a"
+ .align 8
+ .quad .Ls1,.Ls1e
+ .quad .Ls2,.Ls2e
+ .quad .Ls3,.Ls3e
+ .quad .Ls4,.Ls4e
+ .quad .Ld1,.Ls1e
+ .quad .Ld2,.Ls2e
+ .quad .Ld3,.Ls3e
+ .quad .Ld4,.Ls4e
+ .quad .Ls5,.Ls5e
+ .quad .Ls6,.Ls6e
+ .quad .Ls7,.Ls7e
+ .quad .Ls8,.Ls8e
+ .quad .Ld5,.Ls5e
+ .quad .Ld6,.Ls6e
+ .quad .Ld7,.Ls7e
+ .quad .Ld8,.Ls8e
+ .quad .Ls9,.Le_quad
+ .quad .Ld9,.Le_quad
+ .quad .Ls10,.Le_byte
+ .quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+ .quad .Ls11,.Lzero_rest
+ .quad .Ld11,.Lzero_rest
+#endif
+ .quad .Le5,.Le_zero
+ .previous
+
+ /* compute 64-offset for main loop. 8 bytes accuracy with error on the
+ pessimistic side. this is gross. it would be better to fix the
+ interface. */
+ /* eax: zero, ebx: 64 */
+.Ls1e: addl $8,%eax
+.Ls2e: addl $8,%eax
+.Ls3e: addl $8,%eax
+.Ls4e: addl $8,%eax
+.Ls5e: addl $8,%eax
+.Ls6e: addl $8,%eax
+.Ls7e: addl $8,%eax
+.Ls8e: addl $8,%eax
+ addq %rbx,%rdi /* +64 */
+ subq %rax,%rdi /* correct destination with computed offset */
+
+ shlq $6,%rdx /* loop counter * 64 (stride length) */
+ addq %rax,%rdx /* add offset to loopcnt */
+ andl $63,%ecx /* remaining bytes */
+ addq %rcx,%rdx /* add them */
+ jmp .Lzero_rest
+
+ /* exception on quad word loop in tail handling */
+ /* ecx: loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+ shll $3,%ecx
+ andl $7,%edx
+ addl %ecx,%edx
+ /* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+ cmpl $0,(%rsp) /* zero flag set? */
+ jz .Le_zero
+ movq %rdx,%rcx
+.Le_byte:
+ xorl %eax,%eax
+.Le5: rep
+ stosb
+ /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+ movq %rdx,%rax
+ jmp .Lende
+ CFI_ENDPROC
+ENDPROC(__copy_user_nocache)
+
+
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
new file mode 100644
index 00000000000..f0dba36578e
--- /dev/null
+++ b/arch/x86/lib/csum-copy_64.S
@@ -0,0 +1,249 @@
+/*
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of this archive
+ * for more details. No warranty for anything given at all.
+ */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/errno.h>
+
+/*
+ * Checksum copy with exception handling.
+ * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
+ * destination is zeroed.
+ *
+ * Input
+ * rdi source
+ * rsi destination
+ * edx len (32bit)
+ * ecx sum (32bit)
+ * r8 src_err_ptr (int)
+ * r9 dst_err_ptr (int)
+ *
+ * Output
+ * eax 64bit sum. undefined in case of exception.
+ *
+ * Wrappers need to take care of valid exception sum and zeroing.
+ * They also should align source or destination to 8 bytes.
+ */
+
+ .macro source
+10:
+ .section __ex_table,"a"
+ .align 8
+ .quad 10b,.Lbad_source
+ .previous
+ .endm
+
+ .macro dest
+20:
+ .section __ex_table,"a"
+ .align 8
+ .quad 20b,.Lbad_dest
+ .previous
+ .endm
+
+ .macro ignore L=.Lignore
+30:
+ .section __ex_table,"a"
+ .align 8
+ .quad 30b,\L
+ .previous
+ .endm
+
+
+ENTRY(csum_partial_copy_generic)
+ CFI_STARTPROC
+ cmpl $3*64,%edx
+ jle .Lignore
+
+.Lignore:
+ subq $7*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 7*8
+ movq %rbx,2*8(%rsp)
+ CFI_REL_OFFSET rbx, 2*8
+ movq %r12,3*8(%rsp)
+ CFI_REL_OFFSET r12, 3*8
+ movq %r14,4*8(%rsp)
+ CFI_REL_OFFSET r14, 4*8
+ movq %r13,5*8(%rsp)
+ CFI_REL_OFFSET r13, 5*8
+ movq %rbp,6*8(%rsp)
+ CFI_REL_OFFSET rbp, 6*8
+
+ movq %r8,(%rsp)
+ movq %r9,1*8(%rsp)
+
+ movl %ecx,%eax
+ movl %edx,%ecx
+
+ xorl %r9d,%r9d
+ movq %rcx,%r12
+
+ shrq $6,%r12
+ jz .Lhandle_tail /* < 64 */
+
+ clc
+
+ /