Diffstat (limited to 'arch/tile/lib')
30 files changed, 1518 insertions, 897 deletions
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile index 93122d5b155..c4211cbb202 100644 --- a/arch/tile/lib/Makefile +++ b/arch/tile/lib/Makefile @@ -2,17 +2,17 @@  # Makefile for TILE-specific library files..  # -lib-y = cacheflush.o checksum.o cpumask.o delay.o \ -	mb_incoherent.o uaccess.o memmove.o \ -	memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \ -	strchr_$(BITS).o strlen_$(BITS).o - -ifeq ($(CONFIG_TILEGX),y) -lib-y += memcpy_user_64.o -else -lib-y += atomic_32.o atomic_asm_32.o memcpy_tile64.o -endif +lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \ +	memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \ +	strchr_$(BITS).o strlen_$(BITS).o strnlen_$(BITS).o +lib-$(CONFIG_TILEGX) += memcpy_user_64.o +lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o  lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o  obj-$(CONFIG_MODULES) += exports.o + +# The finv_buffer_remote() and copy_{to,from}_user() routines can't +# have -pg added, since they both rely on being leaf functions. +CFLAGS_REMOVE_cacheflush.o = -pg +CFLAGS_REMOVE_memcpy_user_64.o = -pg diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c index 7a5cc706ab6..c89b211fd9e 100644 --- a/arch/tile/lib/atomic_32.c +++ b/arch/tile/lib/atomic_32.c @@ -17,55 +17,15 @@  #include <linux/uaccess.h>  #include <linux/module.h>  #include <linux/mm.h> -#include <asm/atomic.h> -#include <asm/futex.h> +#include <linux/atomic.h>  #include <arch/chip.h> -/* See <asm/atomic_32.h> */ -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - -/* - * A block of memory containing locks for atomic ops. Each instance of this - * struct will be homed on a different CPU. - */ -struct atomic_locks_on_cpu { -	int lock[ATOMIC_HASH_L2_SIZE]; -} __attribute__((aligned(ATOMIC_HASH_L2_SIZE * 4))); - -static DEFINE_PER_CPU(struct atomic_locks_on_cpu, atomic_lock_pool); - -/* The locks we'll use until __init_atomic_per_cpu is called. */ -static struct atomic_locks_on_cpu __initdata initial_atomic_locks; - -/* Hash into this vector to get a pointer to lock for the given atomic. */ -struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE] -	__write_once = { -	[0 ... ATOMIC_HASH_L1_SIZE-1] (&initial_atomic_locks) -}; - -#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ -  /* This page is remapped on startup to be hash-for-home. */ -int atomic_locks[PAGE_SIZE / sizeof(int) /* Only ATOMIC_HASH_SIZE is used */] -  __attribute__((aligned(PAGE_SIZE), section(".bss.page_aligned"))); - -#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ +int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss; -static inline int *__atomic_hashed_lock(volatile void *v) +int *__atomic_hashed_lock(volatile void *v)  { -	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec.S */ -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() -	unsigned long i = -		(unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long)); -	unsigned long n = __insn_crc32_32(0, i); - -	/* Grab high bits for L1 index. */ -	unsigned long l1_index = n >> ((sizeof(n) * 8) - ATOMIC_HASH_L1_SHIFT); -	/* Grab low bits for L2 index. */ -	unsigned long l2_index = n & (ATOMIC_HASH_L2_SIZE - 1); - -	return &atomic_lock_ptr[l1_index]->lock[l2_index]; -#else +	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */  	/*  	 * Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index.  	 * Using mm works here because atomic_locks is page aligned. 
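The retained fast path hashes each atomic's address straight into the page-aligned atomic_locks[] array: per the comment above, bits [3, 3 + ATOMIC_HASH_SHIFT) of the address select the lock, computed in a single "mm" bit-merge instruction. A hedged portable sketch of that selection, not from the patch (helper name illustrative; ATOMIC_HASH_SHIFT as defined in asm/atomic_32.h):

/* Sketch: what the single __insn_mm() in __atomic_hashed_lock() computes,
 * following the comment above: index = bits [3, 3 + ATOMIC_HASH_SHIFT). */
static int *hashed_lock_sketch(volatile void *v)
{
	unsigned long idx = ((unsigned long)v >> 3) &
			    ((1UL << ATOMIC_HASH_SHIFT) - 1);
	return &atomic_locks[idx];
}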
@@ -74,26 +34,13 @@ static inline int *__atomic_hashed_lock(volatile void *v)  				      (unsigned long)atomic_locks,  				      2, (ATOMIC_HASH_SHIFT + 2) - 1);  	return (int *)ptr; -#endif  }  #ifdef CONFIG_SMP  /* Return whether the passed pointer is a valid atomic lock pointer. */  static int is_atomic_lock(int *p)  { -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() -	int i; -	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) { - -		if (p >= &atomic_lock_ptr[i]->lock[0] && -		    p < &atomic_lock_ptr[i]->lock[ATOMIC_HASH_L2_SIZE]) { -			return 1; -		} -	} -	return 0; -#else  	return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE]; -#endif  }  void __atomic_fault_unlock(int *irqlock_word) @@ -112,33 +59,32 @@ static inline int *__atomic_setup(volatile void *v)  	return __atomic_hashed_lock(v);  } -int _atomic_xchg(atomic_t *v, int n) +int _atomic_xchg(int *v, int n)  { -	return __atomic_xchg(&v->counter, __atomic_setup(v), n).val; +	return __atomic_xchg(v, __atomic_setup(v), n).val;  }  EXPORT_SYMBOL(_atomic_xchg); -int _atomic_xchg_add(atomic_t *v, int i) +int _atomic_xchg_add(int *v, int i)  { -	return __atomic_xchg_add(&v->counter, __atomic_setup(v), i).val; +	return __atomic_xchg_add(v, __atomic_setup(v), i).val;  }  EXPORT_SYMBOL(_atomic_xchg_add); -int _atomic_xchg_add_unless(atomic_t *v, int a, int u) +int _atomic_xchg_add_unless(int *v, int a, int u)  {  	/*  	 * Note: argument order is switched here since it is easier  	 * to use the first argument consistently as the "old value"  	 * in the assembly, as is done for _atomic_cmpxchg().  	 */ -	return __atomic_xchg_add_unless(&v->counter, __atomic_setup(v), u, a) -		.val; +	return __atomic_xchg_add_unless(v, __atomic_setup(v), u, a).val;  }  EXPORT_SYMBOL(_atomic_xchg_add_unless); -int _atomic_cmpxchg(atomic_t *v, int o, int n) +int _atomic_cmpxchg(int *v, int o, int n)  { -	return __atomic_cmpxchg(&v->counter, __atomic_setup(v), o, n).val; +	return __atomic_cmpxchg(v, __atomic_setup(v), o, n).val;  }  EXPORT_SYMBOL(_atomic_cmpxchg); @@ -161,78 +107,36 @@ unsigned long _atomic_xor(volatile unsigned long *p, unsigned long mask)  EXPORT_SYMBOL(_atomic_xor); -u64 _atomic64_xchg(atomic64_t *v, u64 n) +long long _atomic64_xchg(long long *v, long long n)  { -	return __atomic64_xchg(&v->counter, __atomic_setup(v), n); +	return __atomic64_xchg(v, __atomic_setup(v), n);  }  EXPORT_SYMBOL(_atomic64_xchg); -u64 _atomic64_xchg_add(atomic64_t *v, u64 i) +long long _atomic64_xchg_add(long long *v, long long i)  { -	return __atomic64_xchg_add(&v->counter, __atomic_setup(v), i); +	return __atomic64_xchg_add(v, __atomic_setup(v), i);  }  EXPORT_SYMBOL(_atomic64_xchg_add); -u64 _atomic64_xchg_add_unless(atomic64_t *v, u64 a, u64 u) +long long _atomic64_xchg_add_unless(long long *v, long long a, long long u)  {  	/*  	 * Note: argument order is switched here since it is easier  	 * to use the first argument consistently as the "old value"  	 * in the assembly, as is done for _atomic_cmpxchg().  	 
*/ -	return __atomic64_xchg_add_unless(&v->counter, __atomic_setup(v), -					  u, a); +	return __atomic64_xchg_add_unless(v, __atomic_setup(v), u, a);  }  EXPORT_SYMBOL(_atomic64_xchg_add_unless); -u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n) +long long _atomic64_cmpxchg(long long *v, long long o, long long n)  { -	return __atomic64_cmpxchg(&v->counter, __atomic_setup(v), o, n); +	return __atomic64_cmpxchg(v, __atomic_setup(v), o, n);  }  EXPORT_SYMBOL(_atomic64_cmpxchg); -static inline int *__futex_setup(int __user *v) -{ -	/* -	 * Issue a prefetch to the counter to bring it into cache. -	 * As for __atomic_setup, but we can't do a read into the L1 -	 * since it might fault; instead we do a prefetch into the L2. -	 */ -	__insn_prefetch(v); -	return __atomic_hashed_lock((int __force *)v); -} - -struct __get_user futex_set(int __user *v, int i) -{ -	return __atomic_xchg((int __force *)v, __futex_setup(v), i); -} - -struct __get_user futex_add(int __user *v, int n) -{ -	return __atomic_xchg_add((int __force *)v, __futex_setup(v), n); -} - -struct __get_user futex_or(int __user *v, int n) -{ -	return __atomic_or((int __force *)v, __futex_setup(v), n); -} - -struct __get_user futex_andn(int __user *v, int n) -{ -	return __atomic_andn((int __force *)v, __futex_setup(v), n); -} - -struct __get_user futex_xor(int __user *v, int n) -{ -	return __atomic_xor((int __force *)v, __futex_setup(v), n); -} - -struct __get_user futex_cmpxchg(int __user *v, int o, int n) -{ -	return __atomic_cmpxchg((int __force *)v, __futex_setup(v), o, n); -} -  /*   * If any of the atomic or futex routines hit a bad address (not in   * the page tables at kernel PL) this routine is called.  The futex @@ -251,54 +155,8 @@ struct __get_user __atomic_bad_address(int __user *addr)  } -#if CHIP_HAS_CBOX_HOME_MAP() -static int __init noatomichash(char *str) -{ -	pr_warning("noatomichash is deprecated.\n"); -	return 1; -} -__setup("noatomichash", noatomichash); -#endif -  void __init __init_atomic_per_cpu(void)  { -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - -	unsigned int i; -	int actual_cpu; - -	/* -	 * Before this is called from setup, we just have one lock for -	 * all atomic objects/operations.  Here we replace the -	 * elements of atomic_lock_ptr so that they point at per_cpu -	 * integers.  This seemingly over-complex approach stems from -	 * the fact that DEFINE_PER_CPU defines an entry for each cpu -	 * in the grid, not each cpu from 0..ATOMIC_HASH_SIZE-1.  But -	 * for efficient hashing of atomics to their locks we want a -	 * compile time constant power of 2 for the size of this -	 * table, so we use ATOMIC_HASH_SIZE. -	 * -	 * Here we populate atomic_lock_ptr from the per cpu -	 * atomic_lock_pool, interspersing by actual cpu so that -	 * subsequent elements are homed on consecutive cpus. -	 */ - -	actual_cpu = cpumask_first(cpu_possible_mask); - -	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) { -		/* -		 * Preincrement to slightly bias against using cpu 0, -		 * which has plenty of stuff homed on it already. 
-		 */ -		actual_cpu = cpumask_next(actual_cpu, cpu_possible_mask); -		if (actual_cpu >= nr_cpu_ids) -			actual_cpu = cpumask_first(cpu_possible_mask); - -		atomic_lock_ptr[i] = &per_cpu(atomic_lock_pool, actual_cpu); -	} - -#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ -  	/* Validate power-of-two and "bigger than cpus" assumption */  	BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));  	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids); @@ -322,9 +180,4 @@ void __init __init_atomic_per_cpu(void)  	 * That should not produce more indices than ATOMIC_HASH_SIZE.  	 */  	BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE); - -#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ - -	/* The futex code makes this assumption, so we validate it here. */ -	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));  } diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S index 5a5514b77e7..6bda3132cd6 100644 --- a/arch/tile/lib/atomic_asm_32.S +++ b/arch/tile/lib/atomic_asm_32.S @@ -14,7 +14,7 @@   * Support routines for atomic operations.  Each function takes:   *   * r0: address to manipulate - * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG) + * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG)   * r2: new value to write, or for cmpxchg/add_unless, value to compare against   * r3: (cmpxchg/xchg_add_unless) new value to write or add;   *     (atomic64 ops) high word of value to write @@ -59,7 +59,7 @@   * bad kernel addresses).   *   * Note that if the value we would store is the same as what we - * loaded, we bypass the load.  Other platforms with true atomics can + * loaded, we bypass the store.  Other platforms with true atomics can   * make the guarantee that a non-atomic __clear_bit(), for example,   * can safely race with an atomic test_and_set_bit(); this example is   * from bit_spinlock.h in slub_lock() / slub_unlock().  We can't do @@ -70,7 +70,7 @@   */  #include <linux/linkage.h> -#include <asm/atomic.h> +#include <asm/atomic_32.h>  #include <asm/page.h>  #include <asm/processor.h> @@ -164,6 +164,7 @@ STD_ENTRY_SECTION(__atomic\name, .text.atomic)  	STD_ENDPROC(__atomic\name)  	.ifc \bitwidth,32  	.pushsection __ex_table,"a" +	.align  4  	.word   1b, __atomic\name  	.word   2b, __atomic\name  	.word   __atomic\name, __atomic_bad_address diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c index 11b6164c209..9c0ec22009a 100644 --- a/arch/tile/lib/cacheflush.c +++ b/arch/tile/lib/cacheflush.c @@ -12,12 +12,162 @@   *   more details.   */ +#include <linux/export.h>  #include <asm/page.h>  #include <asm/cacheflush.h>  #include <arch/icache.h> +#include <arch/spr_def.h>  void __flush_icache_range(unsigned long start, unsigned long end)  {  	invalidate_icache((const void *)start, end - start, PAGE_SIZE);  } + + +/* Force a load instruction to issue. */ +static inline void force_load(char *p) +{ +	*(volatile char *)p; +} + +/* + * Flush and invalidate a VA range that is homed remotely on a single + * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting + * until the memory controller holds the flushed values. 
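A hedged usage sketch, not from the patch, of how a caller hands off such a buffer (the driver hook named here is purely illustrative; finv_buffer_remote() itself is the routine defined and exported below):

/* Sketch: push a remotely-homed (or hash-for-home) buffer all the way to
 * memory before a non-coherent consumer touches it. */
static void hand_buffer_to_device(void *buf, size_t len, int hfh)
{
	finv_buffer_remote(buf, len, hfh);	/* flushed values now in RAM */
	my_dev_start_dma(buf, len);		/* hypothetical driver hook */
}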
+ */ +void __attribute__((optimize("omit-frame-pointer"))) +finv_buffer_remote(void *buffer, size_t size, int hfh) +{ +	char *p, *base; +	size_t step_size, load_count; + +	/* +	 * On TILEPro the striping granularity is a fixed 8KB; on +	 * TILE-Gx it is configurable, and we rely on the fact that +	 * the hypervisor always configures maximum striping, so that +	 * bits 9 and 10 of the PA are part of the stripe function, so +	 * every 512 bytes we hit a striping boundary. +	 * +	 */ +#ifdef __tilegx__ +	const unsigned long STRIPE_WIDTH = 512; +#else +	const unsigned long STRIPE_WIDTH = 8192; +#endif + +#ifdef __tilegx__ +	/* +	 * On TILE-Gx, we must disable the dstream prefetcher before doing +	 * a cache flush; otherwise, we could end up with data in the cache +	 * that we don't want there.  Note that normally we'd do an mf +	 * after the SPR write to disabling the prefetcher, but we do one +	 * below, before any further loads, so there's no need to do it +	 * here. +	 */ +	uint_reg_t old_dstream_pf = __insn_mfspr(SPR_DSTREAM_PF); +	__insn_mtspr(SPR_DSTREAM_PF, 0); +#endif + +	/* +	 * Flush and invalidate the buffer out of the local L1/L2 +	 * and request the home cache to flush and invalidate as well. +	 */ +	__finv_buffer(buffer, size); + +	/* +	 * Wait for the home cache to acknowledge that it has processed +	 * all the flush-and-invalidate requests.  This does not mean +	 * that the flushed data has reached the memory controller yet, +	 * but it does mean the home cache is processing the flushes. +	 */ +	__insn_mf(); + +	/* +	 * Issue a load to the last cache line, which can't complete +	 * until all the previously-issued flushes to the same memory +	 * controller have also completed.  If we weren't striping +	 * memory, that one load would be sufficient, but since we may +	 * be, we also need to back up to the last load issued to +	 * another memory controller, which would be the point where +	 * we crossed a "striping" boundary (the granularity of striping +	 * across memory controllers).  Keep backing up and doing this +	 * until we are before the beginning of the buffer, or have +	 * hit all the controllers. +	 * +	 * If we are flushing a hash-for-home buffer, it's even worse. +	 * Each line may be homed on a different tile, and each tile +	 * may have up to four lines that are on different +	 * controllers.  So as we walk backwards, we have to touch +	 * enough cache lines to satisfy these constraints.  In +	 * practice this ends up being close enough to "load from +	 * every cache line on a full memory stripe on each +	 * controller" that we simply do that, to simplify the logic. +	 * +	 * On TILE-Gx the hash-for-home function is much more complex, +	 * with the upshot being we can't readily guarantee we have +	 * hit both entries in the 128-entry AMT that were hit by any +	 * load in the entire range, so we just re-load them all. +	 * With larger buffers, we may want to consider using a hypervisor +	 * trap to issue loads directly to each hash-for-home tile for +	 * each controller (doing it from Linux would trash the TLB). +	 */ +	if (hfh) { +		step_size = L2_CACHE_BYTES; +#ifdef __tilegx__ +		load_count = (size + L2_CACHE_BYTES - 1) / L2_CACHE_BYTES; +#else +		load_count = (STRIPE_WIDTH / L2_CACHE_BYTES) * +			      (1 << CHIP_LOG_NUM_MSHIMS()); +#endif +	} else { +		step_size = STRIPE_WIDTH; +		load_count = (1 << CHIP_LOG_NUM_MSHIMS()); +	} + +	/* Load the last byte of the buffer. 
*/ +	p = (char *)buffer + size - 1; +	force_load(p); + +	/* Bump down to the end of the previous stripe or cache line. */ +	p -= step_size; +	p = (char *)((unsigned long)p | (step_size - 1)); + +	/* Figure out how far back we need to go. */ +	base = p - (step_size * (load_count - 2)); +	if ((unsigned long)base < (unsigned long)buffer) +		base = buffer; + +	/* +	 * Fire all the loads we need.  The MAF only has eight entries +	 * so we can have at most eight outstanding loads, so we +	 * unroll by that amount. +	 */ +#pragma unroll 8 +	for (; p >= base; p -= step_size) +		force_load(p); + +	/* +	 * Repeat, but with finv's instead of loads, to get rid of the +	 * data we just loaded into our own cache and the old home L3. +	 * No need to unroll since finv's don't target a register. +	 * The finv's are guaranteed not to actually flush the data in +	 * the buffer back to their home, since we just read it, so the +	 * lines are clean in cache; we will only invalidate those lines. +	 */ +	p = (char *)buffer + size - 1; +	__insn_finv(p); +	p -= step_size; +	p = (char *)((unsigned long)p | (step_size - 1)); +	for (; p >= base; p -= step_size) +		__insn_finv(p); + +	/* Wait for these finv's (and thus the first finvs) to be done. */ +	__insn_mf(); + +#ifdef __tilegx__ +	/* Reenable the prefetcher. */ +	__insn_mtspr(SPR_DSTREAM_PF, old_dstream_pf); +#endif +} +EXPORT_SYMBOL_GPL(finv_buffer_remote); diff --git a/arch/tile/lib/checksum.c b/arch/tile/lib/checksum.c index e4bab5bd3f3..c3ca3e64d9d 100644 --- a/arch/tile/lib/checksum.c +++ b/arch/tile/lib/checksum.c @@ -16,19 +16,6 @@  #include <net/checksum.h>  #include <linux/module.h> -static inline unsigned int longto16(unsigned long x) -{ -	unsigned long ret; -#ifdef __tilegx__ -	ret = __insn_v2sadu(x, 0); -	ret = __insn_v2sadu(ret, 0); -#else -	ret = __insn_sadh_u(x, 0); -	ret = __insn_sadh_u(ret, 0); -#endif -	return ret; -} -  __wsum do_csum(const unsigned char *buff, int len)  {  	int odd, count; @@ -94,7 +81,7 @@ __wsum do_csum(const unsigned char *buff, int len)  	}  	if (len & 1)  		result += *buff; -	result = longto16(result); +	result = csum_long(result);  	if (odd)  		result = swab16(result);  out: diff --git a/arch/tile/lib/cpumask.c b/arch/tile/lib/cpumask.c index fdc403614d1..75947edccb2 100644 --- a/arch/tile/lib/cpumask.c +++ b/arch/tile/lib/cpumask.c @@ -16,6 +16,7 @@  #include <linux/ctype.h>  #include <linux/errno.h>  #include <linux/smp.h> +#include <linux/export.h>  /*   * Allow cropping out bits beyond the end of the array. @@ -50,3 +51,4 @@ int bitmap_parselist_crop(const char *bp, unsigned long *maskp, int nmaskbits)  	} while (*bp != '\0' && *bp != '\n');  	return 0;  } +EXPORT_SYMBOL(bitmap_parselist_crop); diff --git a/arch/tile/lib/delay.c b/arch/tile/lib/delay.c index 5801b03c13e..cdacdd11d36 100644 --- a/arch/tile/lib/delay.c +++ b/arch/tile/lib/delay.c @@ -15,20 +15,31 @@  #include <linux/module.h>  #include <linux/delay.h>  #include <linux/thread_info.h> -#include <asm/fixmap.h> -#include <hv/hypervisor.h> +#include <asm/timex.h>  void __udelay(unsigned long usecs)  { -	hv_nanosleep(usecs * 1000); +	if (usecs > ULONG_MAX / 1000) { +		WARN_ON_ONCE(usecs > ULONG_MAX / 1000); +		usecs = ULONG_MAX / 1000; +	} +	__ndelay(usecs * 1000);  }  EXPORT_SYMBOL(__udelay);  void __ndelay(unsigned long nsecs)  { -	hv_nanosleep(nsecs); +	cycles_t target = get_cycles(); +	target += ns2cycles(nsecs); +	while (get_cycles() < target) +		cpu_relax();  }  EXPORT_SYMBOL(__ndelay); -/* FIXME: should be declared in a header somewhere. 
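The new delay.c above drops the hv_nanosleep() call in favour of busy-waiting on the cycle counter; a hedged sketch, not from the patch, of the pattern __ndelay() and the __delay() that follows share (helper name illustrative):

/* Sketch: spin until the free-running cycle counter passes a target. */
static void spin_for_cycles(cycles_t delta)
{
	cycles_t target = get_cycles() + delta;

	while (get_cycles() < target)
		cpu_relax();
}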
*/ +void __delay(unsigned long cycles) +{ +	cycles_t target = get_cycles() + cycles; +	while (get_cycles() < target) +		cpu_relax(); +}  EXPORT_SYMBOL(__delay); diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c index 1509c559765..82733c87d67 100644 --- a/arch/tile/lib/exports.c +++ b/arch/tile/lib/exports.c @@ -18,17 +18,11 @@  /* arch/tile/lib/usercopy.S */  #include <linux/uaccess.h> -EXPORT_SYMBOL(__get_user_1); -EXPORT_SYMBOL(__get_user_2); -EXPORT_SYMBOL(__get_user_4); -EXPORT_SYMBOL(__get_user_8); -EXPORT_SYMBOL(__put_user_1); -EXPORT_SYMBOL(__put_user_2); -EXPORT_SYMBOL(__put_user_4); -EXPORT_SYMBOL(__put_user_8);  EXPORT_SYMBOL(strnlen_user_asm);  EXPORT_SYMBOL(strncpy_from_user_asm);  EXPORT_SYMBOL(clear_user_asm); +EXPORT_SYMBOL(flush_user_asm); +EXPORT_SYMBOL(finv_user_asm);  /* arch/tile/kernel/entry.S */  #include <linux/kernel.h> @@ -36,6 +30,15 @@ EXPORT_SYMBOL(clear_user_asm);  EXPORT_SYMBOL(current_text_addr);  EXPORT_SYMBOL(dump_stack); +/* arch/tile/kernel/head.S */ +EXPORT_SYMBOL(empty_zero_page); + +#ifdef CONFIG_FUNCTION_TRACER +/* arch/tile/kernel/mcount_64.S */ +#include <asm/ftrace.h> +EXPORT_SYMBOL(__mcount); +#endif /* CONFIG_FUNCTION_TRACER */ +  /* arch/tile/lib/, various memcpy files */  EXPORT_SYMBOL(memcpy);  EXPORT_SYMBOL(__copy_to_user_inatomic); @@ -45,9 +48,6 @@ EXPORT_SYMBOL(__copy_from_user_zeroing);  EXPORT_SYMBOL(__copy_in_user_inatomic);  #endif -/* arch/tile/lib/mb_incoherent.S */ -EXPORT_SYMBOL(__mb_incoherent); -  /* hypervisor glue */  #include <hv/hypervisor.h>  EXPORT_SYMBOL(hv_dev_open); @@ -60,6 +60,8 @@ EXPORT_SYMBOL(hv_dev_poll_cancel);  EXPORT_SYMBOL(hv_dev_close);  EXPORT_SYMBOL(hv_sysconf);  EXPORT_SYMBOL(hv_confstr); +EXPORT_SYMBOL(hv_get_rtc); +EXPORT_SYMBOL(hv_set_rtc);  /* libgcc.a */  uint32_t __udivsi3(uint32_t dividend, uint32_t divisor); @@ -79,10 +81,14 @@ EXPORT_SYMBOL(__umoddi3);  int64_t __moddi3(int64_t dividend, int64_t divisor);  EXPORT_SYMBOL(__moddi3);  #ifndef __tilegx__ -uint64_t __ll_mul(uint64_t n0, uint64_t n1); -EXPORT_SYMBOL(__ll_mul);  int64_t __muldi3(int64_t, int64_t);  EXPORT_SYMBOL(__muldi3);  uint64_t __lshrdi3(uint64_t, unsigned int);  EXPORT_SYMBOL(__lshrdi3); +uint64_t __ashrdi3(uint64_t, unsigned int); +EXPORT_SYMBOL(__ashrdi3); +uint64_t __ashldi3(uint64_t, unsigned int); +EXPORT_SYMBOL(__ashldi3); +int __ffsdi2(uint64_t); +EXPORT_SYMBOL(__ffsdi2);  #endif diff --git a/arch/tile/lib/mb_incoherent.S b/arch/tile/lib/mb_incoherent.S deleted file mode 100644 index 989ad7b68d5..00000000000 --- a/arch/tile/lib/mb_incoherent.S +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - *   This program is free software; you can redistribute it and/or - *   modify it under the terms of the GNU General Public License - *   as published by the Free Software Foundation, version 2. - * - *   This program is distributed in the hope that it will be useful, but - *   WITHOUT ANY WARRANTY; without even the implied warranty of - *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - *   NON INFRINGEMENT.  See the GNU General Public License for - *   more details. - * - * Assembly code for invoking the HV's fence_incoherent syscall. - */ - -#include <linux/linkage.h> -#include <hv/syscall_public.h> -#include <arch/abi.h> -#include <arch/chip.h> - -#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS() - -/* - * Invoke the hypervisor's fence_incoherent syscall, which guarantees - * that all victims for cachelines homed on this tile have reached memory. 
- */ -STD_ENTRY(__mb_incoherent) -	moveli TREG_SYSCALL_NR_NAME, HV_SYS_fence_incoherent -	swint2 -	jrp lr -	STD_ENDPROC(__mb_incoherent) - -#endif diff --git a/arch/tile/lib/memchr_32.c b/arch/tile/lib/memchr_32.c index 6235283b485..cc3d9badf03 100644 --- a/arch/tile/lib/memchr_32.c +++ b/arch/tile/lib/memchr_32.c @@ -18,12 +18,24 @@  void *memchr(const void *s, int c, size_t n)  { +	const uint32_t *last_word_ptr; +	const uint32_t *p; +	const char *last_byte_ptr; +	uintptr_t s_int; +	uint32_t goal, before_mask, v, bits; +	char *ret; + +	if (__builtin_expect(n == 0, 0)) { +		/* Don't dereference any memory if the array is empty. */ +		return NULL; +	} +  	/* Get an aligned pointer. */ -	const uintptr_t s_int = (uintptr_t) s; -	const uint32_t *p = (const uint32_t *)(s_int & -4); +	s_int = (uintptr_t) s; +	p = (const uint32_t *)(s_int & -4);  	/* Create four copies of the byte for which we are looking. */ -	const uint32_t goal = 0x01010101 * (uint8_t) c; +	goal = 0x01010101 * (uint8_t) c;  	/* Read the first word, but munge it so that bytes before the array  	 * will not match goal. @@ -31,23 +43,14 @@ void *memchr(const void *s, int c, size_t n)  	 * Note that this shift count expression works because we know  	 * shift counts are taken mod 32.  	 */ -	const uint32_t before_mask = (1 << (s_int << 3)) - 1; -	uint32_t v = (*p | before_mask) ^ (goal & before_mask); +	before_mask = (1 << (s_int << 3)) - 1; +	v = (*p | before_mask) ^ (goal & before_mask);  	/* Compute the address of the last byte. */ -	const char *const last_byte_ptr = (const char *)s + n - 1; +	last_byte_ptr = (const char *)s + n - 1;  	/* Compute the address of the word containing the last byte. */ -	const uint32_t *const last_word_ptr = -	    (const uint32_t *)((uintptr_t) last_byte_ptr & -4); - -	uint32_t bits; -	char *ret; - -	if (__builtin_expect(n == 0, 0)) { -		/* Don't dereference any memory if the array is empty. */ -		return NULL; -	} +	last_word_ptr = (const uint32_t *)((uintptr_t) last_byte_ptr & -4);  	while ((bits = __insn_seqb(v, goal)) == 0) {  		if (__builtin_expect(p == last_word_ptr, 0)) { diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c new file mode 100644 index 00000000000..f8196b3a950 --- /dev/null +++ b/arch/tile/lib/memchr_64.c @@ -0,0 +1,69 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +#include "string-endian.h" + +void *memchr(const void *s, int c, size_t n) +{ +	const uint64_t *last_word_ptr; +	const uint64_t *p; +	const char *last_byte_ptr; +	uintptr_t s_int; +	uint64_t goal, before_mask, v, bits; +	char *ret; + +	if (__builtin_expect(n == 0, 0)) { +		/* Don't dereference any memory if the array is empty. */ +		return NULL; +	} + +	/* Get an aligned pointer. */ +	s_int = (uintptr_t) s; +	p = (const uint64_t *)(s_int & -8); + +	/* Create eight copies of the byte for which we are looking. 
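copy_byte() below comes from the new string-endian.h helper header, which is not part of this hunk; a hedged guess at its shape, mirroring the 32-bit version's 0x01010101 multiply shown earlier:

/* Sketch: eight-way byte replication as string-endian.h presumably does it. */
static inline uint64_t copy_byte_sketch(int c)
{
	return 0x0101010101010101ULL * (uint8_t)c;
}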
*/ +	goal = copy_byte(c); + +	/* Read the first word, but munge it so that bytes before the array +	 * will not match goal. +	 */ +	before_mask = MASK(s_int); +	v = (*p | before_mask) ^ (goal & before_mask); + +	/* Compute the address of the last byte. */ +	last_byte_ptr = (const char *)s + n - 1; + +	/* Compute the address of the word containing the last byte. */ +	last_word_ptr = (const uint64_t *)((uintptr_t) last_byte_ptr & -8); + +	while ((bits = __insn_v1cmpeq(v, goal)) == 0) { +		if (__builtin_expect(p == last_word_ptr, 0)) { +			/* We already read the last word in the array, +			 * so give up. +			 */ +			return NULL; +		} +		v = *++p; +	} + +	/* We found a match, but it might be in a byte past the end +	 * of the array. +	 */ +	ret = ((char *)p) + (CFZ(bits) >> 3); +	return (ret <= last_byte_ptr) ? ret : NULL; +} +EXPORT_SYMBOL(memchr); diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S index 2a419a6122d..a2771ae5da5 100644 --- a/arch/tile/lib/memcpy_32.S +++ b/arch/tile/lib/memcpy_32.S @@ -22,14 +22,6 @@  #include <linux/linkage.h> -/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */ -#if !CHIP_HAS_COHERENT_LOCAL_CACHE() -#define memcpy __memcpy_asm -#define __copy_to_user_inatomic __copy_to_user_inatomic_asm -#define __copy_from_user_inatomic __copy_from_user_inatomic_asm -#define __copy_from_user_zeroing __copy_from_user_zeroing_asm -#endif -  #define IS_MEMCPY	  0  #define IS_COPY_FROM_USER  1  #define IS_COPY_FROM_USER_ZEROING  2 @@ -44,6 +36,7 @@   */  #define EX \  	.pushsection __ex_table, "a"; \ +	.align 4; \  	.word 9f, memcpy_common_fixup; \  	.popsection; \  	9 @@ -158,12 +151,9 @@ EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }  	{ addi r3, r1, 60; andi r9, r9, -64 } -#if CHIP_HAS_WH64()  	/* No need to prefetch dst, we'll just do the wh64  	 * right before we copy a line.  	 */ -#endif -  EX:	{ lw r5, r3; addi r3, r3, 64; movei r4, 1 }  	/* Intentionally stall for a few cycles to leave L2 cache alone. */  	{ bnzt zero, .; move r27, lr } @@ -171,21 +161,6 @@ EX:	{ lw r6, r3; addi r3, r3, 64 }  	/* Intentionally stall for a few cycles to leave L2 cache alone. */  	{ bnzt zero, . }  EX:	{ lw r7, r3; addi r3, r3, 64 } -#if !CHIP_HAS_WH64() -	/* Prefetch the dest */ -	/* Intentionally stall for a few cycles to leave L2 cache alone. */ -	{ bnzt zero, . } -	/* Use a real load to cause a TLB miss if necessary.  We aren't using -	 * r28, so this should be fine. -	 */ -EX:	{ lw r28, r9; addi r9, r9, 64 } -	/* Intentionally stall for a few cycles to leave L2 cache alone. */ -	{ bnzt zero, . } -	{ prefetch r9; addi r9, r9, 64 } -	/* Intentionally stall for a few cycles to leave L2 cache alone. */ -	{ bnzt zero, . } -	{ prefetch r9; addi r9, r9, 64 } -#endif  	/* Intentionally stall for a few cycles to leave L2 cache alone. */  	{ bz zero, .Lbig_loop2 } @@ -286,13 +261,8 @@ EX:	{ lw r7, r3; addi r3, r3, 64 }  	/* Fill second L1D line. */  EX:	{ lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */ -#if CHIP_HAS_WH64()  	/* Prepare destination line for writing. */  EX:	{ wh64 r9; addi r9, r9, 64 } -#else -	/* Prefetch dest line */ -	{ prefetch r9; addi r9, r9, 64 } -#endif  	/* Load seven words that are L1D hits to cover wh64 L2 usage. 
*/  	/* Load the three remaining words from the last L1D line, which @@ -330,16 +300,7 @@ EX:	{ lw r18, r1; addi r1, r1, 4 }                  /* r18 = WORD_8 */  EX:	{ sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */  EX:	{ sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */  EX:	{ sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */ -#if CHIP_HAS_WH64()  EX:	{ sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */ -#else -	/* Back up the r9 to a cache line we are already storing to -	 * if it gets past the end of the dest vector.  Strictly speaking, -	 * we don't need to back up to the start of a cache line, but it's free -	 * and tidy, so why not? -	 */ -EX:	{ sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */ -#endif  	/* Store second L1D line. */  EX:	{ sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */  EX:	{ sw r0, r19; addi r0, r0, 4 }                  /* store(WORD_5) */ @@ -403,7 +364,6 @@ EX:	{ sb r0, r3;   addi r0, r0, 1; addi r2, r2, -1 }  .Ldest_is_word_aligned: -#if CHIP_HAS_DWORD_ALIGN()  EX:	{ andi r8, r0, 63; lwadd_na r6, r1, 4}  	{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned } @@ -511,26 +471,6 @@ EX:	{ swadd r0, r13, 4; addi r2, r2, -32 }  	/* Move r1 back to the point where it corresponds to r0. */  	{ addi r1, r1, -4 } -#else /* !CHIP_HAS_DWORD_ALIGN() */ - -	/* Compute right/left shift counts and load initial source words. */ -	{ andi r5, r1, -4; andi r3, r1, 3 } -EX:	{ lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 } -EX:	{ lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 } - -	/* Load and store one word at a time, using shifts and ORs -	 * to correct for the misaligned src. -	 */ -.Lcopy_unaligned_src_loop: -	{ shr r6, r6, r3; shl r8, r7, r4 } -EX:	{ lw r7, r5; or r8, r8, r6; move r6, r7 } -EX:	{ sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 } -	{ addi r5, r5, 4; slti_u r8, r2, 8 } -	{ bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 } - -	{ bz r2, .Lcopy_unaligned_done } -#endif /* !CHIP_HAS_DWORD_ALIGN() */ -  	/* Fall through */  /* @@ -614,5 +554,6 @@ memcpy_fixup_loop:  	.size memcpy_common_fixup, . - memcpy_common_fixup  	.section __ex_table,"a" +	.align 4  	.word .Lcfu, .Lcopy_from_user_fixup_zero_remainder  	.word .Lctu, .Lcopy_to_user_fixup_done diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c new file mode 100644 index 00000000000..4815354b8cd --- /dev/null +++ b/arch/tile/lib/memcpy_64.c @@ -0,0 +1,367 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +/* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */ + +/* Must be 8 bytes in size. */ +#define op_t uint64_t + +/* Threshold value for when to enter the unrolled loops. */ +#define	OP_T_THRES	16 + +#if CHIP_L2_LINE_SIZE() != 64 +#error "Assumes 64 byte line size" +#endif + +/* How many cache lines ahead should we prefetch? 
*/ +#define PREFETCH_LINES_AHEAD 4 + +/* + * Provide "base versions" of load and store for the normal code path. + * The kernel provides other versions for userspace copies. + */ +#define ST(p, v) (*(p) = (v)) +#define LD(p) (*(p)) + +#ifndef USERCOPY_FUNC +#define ST1 ST +#define ST2 ST +#define ST4 ST +#define ST8 ST +#define LD1 LD +#define LD2 LD +#define LD4 LD +#define LD8 LD +#define RETVAL dstv +void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n) +#else +/* + * Special kernel version will provide implementation of the LDn/STn + * macros to return a count of uncopied bytes due to mm fault. + */ +#define RETVAL 0 +int __attribute__((optimize("omit-frame-pointer"))) +USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n) +#endif +{ +	char *__restrict dst1 = (char *)dstv; +	const char *__restrict src1 = (const char *)srcv; +	const char *__restrict src1_end; +	const char *__restrict prefetch; +	op_t *__restrict dst8;    /* 8-byte pointer to destination memory. */ +	op_t final; /* Final bytes to write to trailing word, if any */ +	long i; + +	if (n < 16) { +		for (; n; n--) +			ST1(dst1++, LD1(src1++)); +		return RETVAL; +	} + +	/* +	 * Locate the end of source memory we will copy.  Don't +	 * prefetch past this. +	 */ +	src1_end = src1 + n - 1; + +	/* Prefetch ahead a few cache lines, but not past the end. */ +	prefetch = src1; +	for (i = 0; i < PREFETCH_LINES_AHEAD; i++) { +		__insn_prefetch(prefetch); +		prefetch += CHIP_L2_LINE_SIZE(); +		prefetch = (prefetch < src1_end) ? prefetch : src1; +	} + +	/* Copy bytes until dst is word-aligned. */ +	for (; (uintptr_t)dst1 & (sizeof(op_t) - 1); n--) +		ST1(dst1++, LD1(src1++)); + +	/* 8-byte pointer to destination memory. */ +	dst8 = (op_t *)dst1; + +	if (__builtin_expect((uintptr_t)src1 & (sizeof(op_t) - 1), 0)) { +		/* Unaligned copy. */ + +		op_t  tmp0 = 0, tmp1 = 0, tmp2, tmp3; +		const op_t *src8 = (const op_t *) ((uintptr_t)src1 & +						   -sizeof(op_t)); +		const void *srci = (void *)src1; +		int m; + +		m = (CHIP_L2_LINE_SIZE() << 2) - +			(((uintptr_t)dst8) & ((CHIP_L2_LINE_SIZE() << 2) - 1)); +		m = (n < m) ? n : m; +		m /= sizeof(op_t); + +		/* Copy until 'dst' is cache-line-aligned. 
*/ +		n -= (sizeof(op_t) * m); + +		switch (m % 4) { +		case 0: +			if (__builtin_expect(!m, 0)) +				goto _M0; +			tmp1 = LD8(src8++); +			tmp2 = LD8(src8++); +			goto _8B3; +		case 2: +			m += 2; +			tmp3 = LD8(src8++); +			tmp0 = LD8(src8++); +			goto _8B1; +		case 3: +			m += 1; +			tmp2 = LD8(src8++); +			tmp3 = LD8(src8++); +			goto _8B2; +		case 1: +			m--; +			tmp0 = LD8(src8++); +			tmp1 = LD8(src8++); +			if (__builtin_expect(!m, 0)) +				goto _8B0; +		} + +		do { +			tmp2 = LD8(src8++); +			tmp0 =  __insn_dblalign(tmp0, tmp1, srci); +			ST8(dst8++, tmp0); +_8B3: +			tmp3 = LD8(src8++); +			tmp1 = __insn_dblalign(tmp1, tmp2, srci); +			ST8(dst8++, tmp1); +_8B2: +			tmp0 = LD8(src8++); +			tmp2 = __insn_dblalign(tmp2, tmp3, srci); +			ST8(dst8++, tmp2); +_8B1: +			tmp1 = LD8(src8++); +			tmp3 = __insn_dblalign(tmp3, tmp0, srci); +			ST8(dst8++, tmp3); +			m -= 4; +		} while (m); + +_8B0: +		tmp0 = __insn_dblalign(tmp0, tmp1, srci); +		ST8(dst8++, tmp0); +		src8--; + +_M0: +		if (__builtin_expect(n >= CHIP_L2_LINE_SIZE(), 0)) { +			op_t tmp4, tmp5, tmp6, tmp7, tmp8; + +			prefetch = ((const char *)src8) + +				CHIP_L2_LINE_SIZE() * PREFETCH_LINES_AHEAD; + +			for (tmp0 = LD8(src8++); n >= CHIP_L2_LINE_SIZE(); +			     n -= CHIP_L2_LINE_SIZE()) { +				/* Prefetch and advance to next line to +				   prefetch, but don't go past the end.  */ +				__insn_prefetch(prefetch); + +				/* Make sure prefetch got scheduled +				   earlier.  */ +				__asm__ ("" : : : "memory"); + +				prefetch += CHIP_L2_LINE_SIZE(); +				prefetch = (prefetch < src1_end) ? prefetch : +					(const char *) src8; + +				tmp1 = LD8(src8++); +				tmp2 = LD8(src8++); +				tmp3 = LD8(src8++); +				tmp4 = LD8(src8++); +				tmp5 = LD8(src8++); +				tmp6 = LD8(src8++); +				tmp7 = LD8(src8++); +				tmp8 = LD8(src8++); + +				tmp0 = __insn_dblalign(tmp0, tmp1, srci); +				tmp1 = __insn_dblalign(tmp1, tmp2, srci); +				tmp2 = __insn_dblalign(tmp2, tmp3, srci); +				tmp3 = __insn_dblalign(tmp3, tmp4, srci); +				tmp4 = __insn_dblalign(tmp4, tmp5, srci); +				tmp5 = __insn_dblalign(tmp5, tmp6, srci); +				tmp6 = __insn_dblalign(tmp6, tmp7, srci); +				tmp7 = __insn_dblalign(tmp7, tmp8, srci); + +				__insn_wh64(dst8); + +				ST8(dst8++, tmp0); +				ST8(dst8++, tmp1); +				ST8(dst8++, tmp2); +				ST8(dst8++, tmp3); +				ST8(dst8++, tmp4); +				ST8(dst8++, tmp5); +				ST8(dst8++, tmp6); +				ST8(dst8++, tmp7); + +				tmp0 = tmp8; +			} +			src8--; +		} + +		/* Copy the rest 8-byte chunks. */ +		if (n >= sizeof(op_t)) { +			tmp0 = LD8(src8++); +			for (; n >= sizeof(op_t); n -= sizeof(op_t)) { +				tmp1 = LD8(src8++); +				tmp0 = __insn_dblalign(tmp0, tmp1, srci); +				ST8(dst8++, tmp0); +				tmp0 = tmp1; +			} +			src8--; +		} + +		if (n == 0) +			return RETVAL; + +		tmp0 = LD8(src8++); +		tmp1 = ((const char *)src8 <= src1_end) +			? LD8((op_t *)src8) : 0; +		final = __insn_dblalign(tmp0, tmp1, srci); + +	} else { +		/* Aligned copy. */ + +		const op_t *__restrict src8 = (const op_t *)src1; + +		/* src8 and dst8 are both word-aligned. */ +		if (n >= CHIP_L2_LINE_SIZE()) { +			/* Copy until 'dst' is cache-line-aligned. */ +			for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1); +			     n -= sizeof(op_t)) +				ST8(dst8++, LD8(src8++)); + +			for (; n >= CHIP_L2_LINE_SIZE(); ) { +				op_t tmp0, tmp1, tmp2, tmp3; +				op_t tmp4, tmp5, tmp6, tmp7; + +				/* +				 * Prefetch and advance to next line +				 * to prefetch, but don't go past the +				 * end. 
+				 */ +				__insn_prefetch(prefetch); + +				/* Make sure prefetch got scheduled +				   earlier.  */ +				__asm__ ("" : : : "memory"); + +				prefetch += CHIP_L2_LINE_SIZE(); +				prefetch = (prefetch < src1_end) ? prefetch : +					(const char *)src8; + +				/* +				 * Do all the loads before wh64.  This +				 * is necessary if [src8, src8+7] and +				 * [dst8, dst8+7] share the same cache +				 * line and dst8 <= src8, as can be +				 * the case when called from memmove, +				 * or with code tested on x86 whose +				 * memcpy always works with forward +				 * copies. +				 */ +				tmp0 = LD8(src8++); +				tmp1 = LD8(src8++); +				tmp2 = LD8(src8++); +				tmp3 = LD8(src8++); +				tmp4 = LD8(src8++); +				tmp5 = LD8(src8++); +				tmp6 = LD8(src8++); +				tmp7 = LD8(src8++); + +				/* wh64 and wait for tmp7 load completion. */ +				__asm__ ("move %0, %0; wh64 %1\n" +					 : : "r"(tmp7), "r"(dst8)); + +				ST8(dst8++, tmp0); +				ST8(dst8++, tmp1); +				ST8(dst8++, tmp2); +				ST8(dst8++, tmp3); +				ST8(dst8++, tmp4); +				ST8(dst8++, tmp5); +				ST8(dst8++, tmp6); +				ST8(dst8++, tmp7); + +				n -= CHIP_L2_LINE_SIZE(); +			} +#if CHIP_L2_LINE_SIZE() != 64 +# error "Fix code that assumes particular L2 cache line size." +#endif +		} + +		for (; n >= sizeof(op_t); n -= sizeof(op_t)) +			ST8(dst8++, LD8(src8++)); + +		if (__builtin_expect(n == 0, 1)) +			return RETVAL; + +		final = LD8(src8); +	} + +	/* n != 0 if we get here.  Write out any trailing bytes. */ +	dst1 = (char *)dst8; +#ifndef __BIG_ENDIAN__ +	if (n & 4) { +		ST4((uint32_t *)dst1, final); +		dst1 += 4; +		final >>= 32; +		n &= 3; +	} +	if (n & 2) { +		ST2((uint16_t *)dst1, final); +		dst1 += 2; +		final >>= 16; +		n &= 1; +	} +	if (n) +		ST1((uint8_t *)dst1, final); +#else +	if (n & 4) { +		ST4((uint32_t *)dst1, final >> 32); +		dst1 += 4; +        } +        else +        { +		final >>= 32; +        } +	if (n & 2) { +		ST2((uint16_t *)dst1, final >> 16); +		dst1 += 2; +        } +        else +        { +		final >>= 16; +        } +	if (n & 1) +		ST1((uint8_t *)dst1, final >> 8); +#endif + +	return RETVAL; +} + +#ifdef USERCOPY_FUNC +#undef ST1 +#undef ST2 +#undef ST4 +#undef ST8 +#undef LD1 +#undef LD2 +#undef LD4 +#undef LD8 +#undef USERCOPY_FUNC +#endif diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c deleted file mode 100644 index f7d4a6ad61e..00000000000 --- a/arch/tile/lib/memcpy_tile64.c +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - *   This program is free software; you can redistribute it and/or - *   modify it under the terms of the GNU General Public License - *   as published by the Free Software Foundation, version 2. - * - *   This program is distributed in the hope that it will be useful, but - *   WITHOUT ANY WARRANTY; without even the implied warranty of - *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - *   NON INFRINGEMENT.  See the GNU General Public License for - *   more details. 
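Back in the new memcpy_64.c above, the unaligned path leans on __insn_dblalign(), which produces an aligned 8-byte destination word from two adjacent source words using the byte offset of the original source pointer. A hedged little-endian model, not the architecture's authoritative definition:

/* Sketch: rough model of __insn_dblalign(a, b, ptr) as used in the
 * unaligned loop of memcpy_64.c (little-endian assumed). */
static uint64_t dblalign_model(uint64_t a, uint64_t b, const void *ptr)
{
	unsigned int shift = ((uintptr_t)ptr & 7) * 8;

	return shift ? (a >> shift) | (b << (64 - shift)) : a;
}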
- */ - -#include <linux/string.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/uaccess.h> -#include <asm/fixmap.h> -#include <asm/kmap_types.h> -#include <asm/tlbflush.h> -#include <hv/hypervisor.h> -#include <arch/chip.h> - - -#if !CHIP_HAS_COHERENT_LOCAL_CACHE() - -/* Defined in memcpy.S */ -extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n); -extern unsigned long __copy_to_user_inatomic_asm( -	void __user *to, const void *from, unsigned long n); -extern unsigned long __copy_from_user_inatomic_asm( -	void *to, const void __user *from, unsigned long n); -extern unsigned long __copy_from_user_zeroing_asm( -	void *to, const void __user *from, unsigned long n); - -typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long); - -/* Size above which to consider TLB games for performance */ -#define LARGE_COPY_CUTOFF 2048 - -/* Communicate to the simulator what we are trying to do. */ -#define sim_allow_multiple_caching(b) \ -  __insn_mtspr(SPR_SIM_CONTROL, \ -   SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS)) - -/* - * Copy memory by briefly enabling incoherent cacheline-at-a-time mode. - * - * We set up our own source and destination PTEs that we fully control. - * This is the only way to guarantee that we don't race with another - * thread that is modifying the PTE; we can't afford to try the - * copy_{to,from}_user() technique of catching the interrupt, since - * we must run with interrupts disabled to avoid the risk of some - * other code seeing the incoherent data in our cache.  (Recall that - * our cache is indexed by PA, so even if the other code doesn't use - * our kmap_atomic virtual addresses, they'll still hit in cache using - * the normal VAs that aren't supposed to hit in cache.) - */ -static void memcpy_multicache(void *dest, const void *source, -			      pte_t dst_pte, pte_t src_pte, int len) -{ -	int idx; -	unsigned long flags, newsrc, newdst; -	pmd_t *pmdp; -	pte_t *ptep; -	int type0, type1; -	int cpu = get_cpu(); - -	/* -	 * Disable interrupts so that we don't recurse into memcpy() -	 * in an interrupt handler, nor accidentally reference -	 * the PA of the source from an interrupt routine.  Also -	 * notify the simulator that we're playing games so we don't -	 * generate spurious coherency warnings. -	 */ -	local_irq_save(flags); -	sim_allow_multiple_caching(1); - -	/* Set up the new dest mapping */ -	type0 = kmap_atomic_idx_push(); -	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0; -	newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1)); -	pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst); -	ptep = pte_offset_kernel(pmdp, newdst); -	if (pte_val(*ptep) != pte_val(dst_pte)) { -		set_pte(ptep, dst_pte); -		local_flush_tlb_page(NULL, newdst, PAGE_SIZE); -	} - -	/* Set up the new source mapping */ -	type1 = kmap_atomic_idx_push(); -	idx += (type0 - type1); -	src_pte = hv_pte_set_nc(src_pte); -	src_pte = hv_pte_clear_writable(src_pte);  /* be paranoid */ -	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1)); -	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc); -	ptep = pte_offset_kernel(pmdp, newsrc); -	*ptep = src_pte;   /* set_pte() would be confused by this */ -	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); - -	/* Actually move the data. 
*/ -	__memcpy_asm((void *)newdst, (const void *)newsrc, len); - -	/* -	 * Remap the source as locally-cached and not OLOC'ed so that -	 * we can inval without also invaling the remote cpu's cache. -	 * This also avoids known errata with inv'ing cacheable oloc data. -	 */ -	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3); -	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */ -	*ptep = src_pte;   /* set_pte() would be confused by this */ -	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); - -	/* -	 * Do the actual invalidation, covering the full L2 cache line -	 * at the end since __memcpy_asm() is somewhat aggressive. -	 */ -	__inv_buffer((void *)newsrc, len); - -	/* -	 * We're done: notify the simulator that all is back to normal, -	 * and re-enable interrupts and pre-emption. -	 */ -	kmap_atomic_idx_pop(); -	kmap_atomic_idx_pop(); -	sim_allow_multiple_caching(0); -	local_irq_restore(flags); -	put_cpu(); -} - -/* - * Identify large copies from remotely-cached memory, and copy them - * via memcpy_multicache() if they look good, otherwise fall back - * to the particular kind of copying passed as the memcpy_t function. - */ -static unsigned long fast_copy(void *dest, const void *source, int len, -			       memcpy_t func) -{ -	/* -	 * Check if it's big enough to bother with.  We may end up doing a -	 * small copy via TLB manipulation if we're near a page boundary, -	 * but presumably we'll make it up when we hit the second page. -	 */ -	while (len >= LARGE_COPY_CUTOFF) { -		int copy_size, bytes_left_on_page; -		pte_t *src_ptep, *dst_ptep; -		pte_t src_pte, dst_pte; -		struct page *src_page, *dst_page; - -		/* Is the source page oloc'ed to a remote cpu? */ -retry_source: -		src_ptep = virt_to_pte(current->mm, (unsigned long)source); -		if (src_ptep == NULL) -			break; -		src_pte = *src_ptep; -		if (!hv_pte_get_present(src_pte) || -		    !hv_pte_get_readable(src_pte) || -		    hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3) -			break; -		if (get_remote_cache_cpu(src_pte) == smp_processor_id()) -			break; -		src_page = pfn_to_page(hv_pte_get_pfn(src_pte)); -		get_page(src_page); -		if (pte_val(src_pte) != pte_val(*src_ptep)) { -			put_page(src_page); -			goto retry_source; -		} -		if (pte_huge(src_pte)) { -			/* Adjust the PTE to correspond to a small page */ -			int pfn = hv_pte_get_pfn(src_pte); -			pfn += (((unsigned long)source & (HPAGE_SIZE-1)) -				>> PAGE_SHIFT); -			src_pte = pfn_pte(pfn, src_pte); -			src_pte = pte_mksmall(src_pte); -		} - -		/* Is the destination page writable? */ -retry_dest: -		dst_ptep = virt_to_pte(current->mm, (unsigned long)dest); -		if (dst_ptep == NULL) { -			put_page(src_page); -			break; -		} -		dst_pte = *dst_ptep; -		if (!hv_pte_get_present(dst_pte) || -		    !hv_pte_get_writable(dst_pte)) { -			put_page(src_page); -			break; -		} -		dst_page = pfn_to_page(hv_pte_get_pfn(dst_pte)); -		if (dst_page == src_page) { -			/* -			 * Source and dest are on the same page; this -			 * potentially exposes us to incoherence if any -			 * part of src and dest overlap on a cache line. -			 * Just give up rather than trying to be precise. 
-			 */ -			put_page(src_page); -			break; -		} -		get_page(dst_page); -		if (pte_val(dst_pte) != pte_val(*dst_ptep)) { -			put_page(dst_page); -			goto retry_dest; -		} -		if (pte_huge(dst_pte)) { -			/* Adjust the PTE to correspond to a small page */ -			int pfn = hv_pte_get_pfn(dst_pte); -			pfn += (((unsigned long)dest & (HPAGE_SIZE-1)) -				>> PAGE_SHIFT); -			dst_pte = pfn_pte(pfn, dst_pte); -			dst_pte = pte_mksmall(dst_pte); -		} - -		/* All looks good: create a cachable PTE and copy from it */ -		copy_size = len; -		bytes_left_on_page = -			PAGE_SIZE - (((int)source) & (PAGE_SIZE-1)); -		if (copy_size > bytes_left_on_page) -			copy_size = bytes_left_on_page; -		bytes_left_on_page = -			PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1)); -		if (copy_size > bytes_left_on_page) -			copy_size = bytes_left_on_page; -		memcpy_multicache(dest, source, dst_pte, src_pte, copy_size); - -		/* Release the pages */ -		put_page(dst_page); -		put_page(src_page); - -		/* Continue on the next page */ -		dest += copy_size; -		source += copy_size; -		len -= copy_size; -	} - -	return func(dest, source, len); -} - -void *memcpy(void *to, const void *from, __kernel_size_t n) -{ -	if (n < LARGE_COPY_CUTOFF) -		return (void *)__memcpy_asm(to, from, n); -	else -		return (void *)fast_copy(to, from, n, __memcpy_asm); -} - -unsigned long __copy_to_user_inatomic(void __user *to, const void *from, -				      unsigned long n) -{ -	if (n < LARGE_COPY_CUTOFF) -		return __copy_to_user_inatomic_asm(to, from, n); -	else -		return fast_copy(to, from, n, __copy_to_user_inatomic_asm); -} - -unsigned long __copy_from_user_inatomic(void *to, const void __user *from, -					unsigned long n) -{ -	if (n < LARGE_COPY_CUTOFF) -		return __copy_from_user_inatomic_asm(to, from, n); -	else -		return fast_copy(to, from, n, __copy_from_user_inatomic_asm); -} - -unsigned long __copy_from_user_zeroing(void *to, const void __user *from, -				       unsigned long n) -{ -	if (n < LARGE_COPY_CUTOFF) -		return __copy_from_user_zeroing_asm(to, from, n); -	else -		return fast_copy(to, from, n, __copy_from_user_zeroing_asm); -} - -#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */ diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c new file mode 100644 index 00000000000..88c7016492c --- /dev/null +++ b/arch/tile/lib/memcpy_user_64.c @@ -0,0 +1,94 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + * + * Do memcpy(), but trap and return "n" when a load or store faults. + * + * Note: this idiom only works when memcpy() compiles to a leaf function. + * Here leaf function not only means it does not have calls, but also + * requires no stack operations (sp, stack frame pointer) and no + * use of callee-saved registers, else "jrp lr" will be incorrect since + * unwinding stack frame is bypassed. Since memcpy() is not complex so + * these conditions are satisfied here, but we need to be careful when + * modifying this file. This is not a clean solution but is the best + * one so far. 
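On the calling side these routines keep the usual kernel convention of returning the number of bytes left uncopied when a fault is taken; a hedged caller sketch, not from the patch (function name illustrative):

/* Sketch: a non-zero return means the copy faulted partway through. */
static int fetch_from_user(void *dst, const void __user *src, unsigned long len)
{
	unsigned long left = __copy_from_user_inatomic(dst, src, len);

	return left ? -EFAULT : 0;
}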
+ * + * Also note that we are capturing "n" from the containing scope here. + */ + +#define _ST(p, inst, v)						\ +	({							\ +		asm("1: " #inst " %0, %1;"			\ +		    ".pushsection .coldtext.memcpy,\"ax\";"	\ +		    "2: { move r0, %2; jrp lr };"		\ +		    ".section __ex_table,\"a\";"		\ +		    ".align 8;"					\ +		    ".quad 1b, 2b;"				\ +		    ".popsection"				\ +		    : "=m" (*(p)) : "r" (v), "r" (n));		\ +	}) + +#define _LD(p, inst)						\ +	({							\ +		unsigned long __v;				\ +		asm("1: " #inst " %0, %1;"			\ +		    ".pushsection .coldtext.memcpy,\"ax\";"	\ +		    "2: { move r0, %2; jrp lr };"		\ +		    ".section __ex_table,\"a\";"		\ +		    ".align 8;"					\ +		    ".quad 1b, 2b;"				\ +		    ".popsection"				\ +		    : "=r" (__v) : "m" (*(p)), "r" (n));	\ +		__v;						\ +	}) + +#define USERCOPY_FUNC __copy_to_user_inatomic +#define ST1(p, v) _ST((p), st1, (v)) +#define ST2(p, v) _ST((p), st2, (v)) +#define ST4(p, v) _ST((p), st4, (v)) +#define ST8(p, v) _ST((p), st, (v)) +#define LD1 LD +#define LD2 LD +#define LD4 LD +#define LD8 LD +#include "memcpy_64.c" + +#define USERCOPY_FUNC __copy_from_user_inatomic +#define ST1 ST +#define ST2 ST +#define ST4 ST +#define ST8 ST +#define LD1(p) _LD((p), ld1u) +#define LD2(p) _LD((p), ld2u) +#define LD4(p) _LD((p), ld4u) +#define LD8(p) _LD((p), ld) +#include "memcpy_64.c" + +#define USERCOPY_FUNC __copy_in_user_inatomic +#define ST1(p, v) _ST((p), st1, (v)) +#define ST2(p, v) _ST((p), st2, (v)) +#define ST4(p, v) _ST((p), st4, (v)) +#define ST8(p, v) _ST((p), st, (v)) +#define LD1(p) _LD((p), ld1u) +#define LD2(p) _LD((p), ld2u) +#define LD4(p) _LD((p), ld4u) +#define LD8(p) _LD((p), ld) +#include "memcpy_64.c" + +unsigned long __copy_from_user_zeroing(void *to, const void __user *from, +				       unsigned long n) +{ +	unsigned long rc = __copy_from_user_inatomic(to, from, n); +	if (unlikely(rc)) +		memset(to + n - rc, 0, rc); +	return rc; +} diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c index 57dbb3a5bff..2042bfe6595 100644 --- a/arch/tile/lib/memset_32.c +++ b/arch/tile/lib/memset_32.c @@ -12,13 +12,10 @@   *   more details.   */ -#include <arch/chip.h> -  #include <linux/types.h>  #include <linux/string.h>  #include <linux/module.h> - -#undef memset +#include <arch/chip.h>  void *memset(void *s, int c, size_t n)  { @@ -26,11 +23,7 @@ void *memset(void *s, int c, size_t n)  	int n32;  	uint32_t v16, v32;  	uint8_t *out8 = s; -#if !CHIP_HAS_WH64() -	int ahead32; -#else  	int to_align32; -#endif  	/* Experimentation shows that a trivial tight loop is a win up until  	 * around a size of 20, where writing a word at a time starts to win. @@ -61,21 +54,6 @@ void *memset(void *s, int c, size_t n)  		return s;  	} -#if !CHIP_HAS_WH64() -	/* Use a spare issue slot to start prefetching the first cache -	 * line early. This instruction is free as the store can be buried -	 * in otherwise idle issue slots doing ALU ops. -	 */ -	__insn_prefetch(out8); - -	/* We prefetch the end so that a short memset that spans two cache -	 * lines gets some prefetching benefit. Again we believe this is free -	 * to issue. -	 */ -	__insn_prefetch(&out8[n - 1]); -#endif /* !CHIP_HAS_WH64() */ - -  	/* Align 'out8'. We know n >= 3 so this won't write past the end. */  	while (((uintptr_t) out8 & 3) != 0) {  		*out8++ = c; @@ -96,90 +74,6 @@ void *memset(void *s, int c, size_t n)  	/* This must be at least 8 or the following loop doesn't work. 
*/  #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) -#if !CHIP_HAS_WH64() - -	ahead32 = CACHE_LINE_SIZE_IN_WORDS; - -	/* We already prefetched the first and last cache lines, so -	 * we only need to do more prefetching if we are storing -	 * to more than two cache lines. -	 */ -	if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) { -		int i; - -		/* Prefetch the next several cache lines. -		 * This is the setup code for the software-pipelined -		 * loop below. -		 */ -#define MAX_PREFETCH 5 -		ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS; -		if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS) -			ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS; - -		for (i = CACHE_LINE_SIZE_IN_WORDS; -		     i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS) -			__insn_prefetch(&out32[i]); -	} - -	if (n32 > ahead32) { -		while (1) { -			int j; - -			/* Prefetch by reading one word several cache lines -			 * ahead.  Since loads are non-blocking this will -			 * cause the full cache line to be read while we are -			 * finishing earlier cache lines.  Using a store -			 * here causes microarchitectural performance -			 * problems where a victimizing store miss goes to -			 * the head of the retry FIFO and locks the pipe for -			 * a few cycles.  So a few subsequent stores in this -			 * loop go into the retry FIFO, and then later -			 * stores see other stores to the same cache line -			 * are already in the retry FIFO and themselves go -			 * into the retry FIFO, filling it up and grinding -			 * to a halt waiting for the original miss to be -			 * satisfied. -			 */ -			__insn_prefetch(&out32[ahead32]); - -#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0 -#error "Unhandled CACHE_LINE_SIZE_IN_WORDS" -#endif - -			n32 -= CACHE_LINE_SIZE_IN_WORDS; - -			/* Save icache space by only partially unrolling -			 * this loop. -			 */ -			for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) { -				*out32++ = v32; -				*out32++ = v32; -				*out32++ = v32; -				*out32++ = v32; -			} - -			/* To save compiled code size, reuse this loop even -			 * when we run out of prefetching to do by dropping -			 * ahead32 down. -			 */ -			if (n32 <= ahead32) { -				/* Not even a full cache line left, -				 * so stop now. -				 */ -				if (n32 < CACHE_LINE_SIZE_IN_WORDS) -					break; - -				/* Choose a small enough value that we don't -				 * prefetch past the end.  There's no sense -				 * in touching cache lines we don't have to. -				 */ -				ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1; -			} -		} -	} - -#else /* CHIP_HAS_WH64() */ -  	/* Determine how many words we need to emit before the 'out32'  	 * pointer becomes aligned modulo the cache line size.  	 */ @@ -236,8 +130,6 @@ void *memset(void *s, int c, size_t n)  		n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;  	} -#endif /* CHIP_HAS_WH64() */ -  	/* Now handle any leftover values. */  	if (n32 != 0) {  		do { diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c new file mode 100644 index 00000000000..03ef69cd73d --- /dev/null +++ b/arch/tile/lib/memset_64.c @@ -0,0 +1,142 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  
See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +#include <arch/chip.h> +#include "string-endian.h" + +void *memset(void *s, int c, size_t n) +{ +	uint64_t *out64; +	int n64, to_align64; +	uint64_t v64; +	uint8_t *out8 = s; + +	/* Experimentation shows that a trivial tight loop is a win up until +	 * around a size of 20, where writing a word at a time starts to win. +	 */ +#define BYTE_CUTOFF 20 + +#if BYTE_CUTOFF < 7 +	/* This must be at least at least this big, or some code later +	 * on doesn't work. +	 */ +#error "BYTE_CUTOFF is too small" +#endif + +	if (n < BYTE_CUTOFF) { +		/* Strangely, this turns out to be the tightest way to +		 * write this loop. +		 */ +		if (n != 0) { +			do { +				/* Strangely, combining these into one line +				 * performs worse. +				 */ +				*out8 = c; +				out8++; +			} while (--n != 0); +		} + +		return s; +	} + +	/* Align 'out8'. We know n >= 7 so this won't write past the end. */ +	while (((uintptr_t) out8 & 7) != 0) { +		*out8++ = c; +		--n; +	} + +	/* Align 'n'. */ +	while (n & 7) +		out8[--n] = c; + +	out64 = (uint64_t *) out8; +	n64 = n >> 3; + +	/* Tile input byte out to 64 bits. */ +	v64 = copy_byte(c); + +	/* This must be at least 8 or the following loop doesn't work. */ +#define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8) + +	/* Determine how many words we need to emit before the 'out32' +	 * pointer becomes aligned modulo the cache line size. +	 */ +	to_align64 = (-((uintptr_t)out64 >> 3)) & +		(CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1); + +	/* Only bother aligning and using wh64 if there is at least +	 * one full cache line to process.  This check also prevents +	 * overrunning the end of the buffer with alignment words. +	 */ +	if (to_align64 <= n64 - CACHE_LINE_SIZE_IN_DOUBLEWORDS) { +		int lines_left; + +		/* Align out64 mod the cache line size so we can use wh64. */ +		n64 -= to_align64; +		for (; to_align64 != 0; to_align64--) { +			*out64 = v64; +			out64++; +		} + +		/* Use unsigned divide to turn this into a right shift. */ +		lines_left = (unsigned)n64 / CACHE_LINE_SIZE_IN_DOUBLEWORDS; + +		do { +			/* Only wh64 a few lines at a time, so we don't +			 * exceed the maximum number of victim lines. +			 */ +			int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS()) +				  ? lines_left +				  : CHIP_MAX_OUTSTANDING_VICTIMS()); +			uint64_t *wh = out64; +			int i = x; +			int j; + +			lines_left -= x; + +			do { +				__insn_wh64(wh); +				wh += CACHE_LINE_SIZE_IN_DOUBLEWORDS; +			} while (--i); + +			for (j = x * (CACHE_LINE_SIZE_IN_DOUBLEWORDS / 4); +			     j != 0; j--) { +				*out64++ = v64; +				*out64++ = v64; +				*out64++ = v64; +				*out64++ = v64; +			} +		} while (lines_left != 0); + +		/* We processed all full lines above, so only this many +		 * words remain to be processed. +		 */ +		n64 &= CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1; +	} + +	/* Now handle any leftover values. 
*/ +	if (n64 != 0) { +		do { +			*out64 = v64; +			out64++; +		} while (--n64 != 0); +	} + +	return s; +} +EXPORT_SYMBOL(memset); diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c index 485e24d62c6..b34f79aada4 100644 --- a/arch/tile/lib/spinlock_32.c +++ b/arch/tile/lib/spinlock_32.c @@ -15,6 +15,7 @@  #include <linux/spinlock.h>  #include <linux/module.h>  #include <asm/processor.h> +#include <arch/spr_def.h>  #include "spinlock_common.h" @@ -91,75 +92,75 @@ EXPORT_SYMBOL(arch_spin_unlock_wait);  #define RD_COUNT_MASK   ((1 << RD_COUNT_WIDTH) - 1) -/* Lock the word, spinning until there are no tns-ers. */ -static inline u32 get_rwlock(arch_rwlock_t *rwlock) -{ -	u32 iterations = 0; -	for (;;) { -		u32 val = __insn_tns((int *)&rwlock->lock); -		if (unlikely(val & 1)) { -			delay_backoff(iterations++); -			continue; -		} -		return val; -	} -} - -int arch_read_trylock_slow(arch_rwlock_t *rwlock) -{ -	u32 val = get_rwlock(rwlock); -	int locked = (val << RD_COUNT_WIDTH) == 0; -	rwlock->lock = val + (locked << RD_COUNT_SHIFT); -	return locked; -} -EXPORT_SYMBOL(arch_read_trylock_slow); - -void arch_read_unlock_slow(arch_rwlock_t *rwlock) -{ -	u32 val = get_rwlock(rwlock); -	rwlock->lock = val - (1 << RD_COUNT_SHIFT); -} -EXPORT_SYMBOL(arch_read_unlock_slow); - -void arch_write_unlock_slow(arch_rwlock_t *rwlock, u32 val) +/* + * We can get the read lock if everything but the reader bits (which + * are in the high part of the word) is zero, i.e. no active or + * waiting writers, no tns. + * + * We guard the tns/store-back with an interrupt critical section to + * preserve the semantic that the same read lock can be acquired in an + * interrupt context. + */ +int arch_read_trylock(arch_rwlock_t *rwlock)  { -	u32 eq, mask = 1 << WR_CURR_SHIFT; -	while (unlikely(val & 1)) { -		/* Limited backoff since we are the highest-priority task. */ -		relax(4); -		val = __insn_tns((int *)&rwlock->lock); +	u32 val; +	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); +	val = __insn_tns((int *)&rwlock->lock); +	if (likely((val << _RD_COUNT_WIDTH) == 0)) { +		val += 1 << RD_COUNT_SHIFT; +		rwlock->lock = val; +		__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); +		BUG_ON(val == 0);  /* we don't expect wraparound */ +		return 1;  	} -	val = __insn_addb(val, mask); -	eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT)); -	val = __insn_mz(eq & mask, val); -	rwlock->lock = val; +	if ((val & 1) == 0) +		rwlock->lock = val; +	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); +	return 0;  } -EXPORT_SYMBOL(arch_write_unlock_slow); +EXPORT_SYMBOL(arch_read_trylock);  /* - * We spin until everything but the reader bits (which are in the high - * part of the word) are zero, i.e. no active or waiting writers, no tns. - * + * Spin doing arch_read_trylock() until we acquire the lock.   * ISSUE: This approach can permanently starve readers.  A reader who sees   * a writer could instead take a ticket lock (just like a writer would),   * and atomically enter read mode (with 1 reader) when it gets the ticket. - * This way both readers and writers will always make forward progress + * This way both readers and writers would always make forward progress   * in a finite time.   
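
The core of the new memset_64.c above is a plain word fill: replicate the byte across a 64-bit value, byte-store until the pointer is 8-byte aligned, then store doublewords and finish the tail. A portable sketch of just that structure, using a multiply where the kernel uses copy_byte()/shufflebytes and leaving out the wh64 cache-line priming and the short-length cutoff:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void *memset64_sketch(void *s, int c, size_t n)
{
	uint8_t *out8 = s;
	uint64_t v64 = (uint8_t)c * 0x0101010101010101ULL;	/* tile byte out */

	while (n != 0 && ((uintptr_t)out8 & 7) != 0) {	/* align the pointer */
		*out8++ = (uint8_t)c;
		--n;
	}

	uint64_t *out64 = (uint64_t *)out8;
	for (size_t n64 = n >> 3; n64 != 0; --n64)	/* aligned stores */
		*out64++ = v64;

	memset(out64, c, n & 7);			/* leftover tail bytes */
	return s;
}

int main(void)
{
	char *buf = malloc(41);
	memset64_sketch(buf, 'x', 40);
	buf[40] = '\0';
	printf("%s\n", buf);		/* forty 'x' characters */
	free(buf);
	return 0;
}
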
*/ -void arch_read_lock_slow(arch_rwlock_t *rwlock, u32 val) +void arch_read_lock(arch_rwlock_t *rwlock)  {  	u32 iterations = 0; -	do { -		if (!(val & 1)) -			rwlock->lock = val; +	while (unlikely(!arch_read_trylock(rwlock)))  		delay_backoff(iterations++); +} +EXPORT_SYMBOL(arch_read_lock); + +void arch_read_unlock(arch_rwlock_t *rwlock) +{ +	u32 val, iterations = 0; + +	mb();  /* guarantee anything modified under the lock is visible */ +	for (;;) { +		__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);  		val = __insn_tns((int *)&rwlock->lock); -	} while ((val << RD_COUNT_WIDTH) != 0); -	rwlock->lock = val + (1 << RD_COUNT_SHIFT); +		if (likely((val & 1) == 0)) { +			rwlock->lock = val - (1 << _RD_COUNT_SHIFT); +			__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); +			break; +		} +		__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); +		delay_backoff(iterations++); +	}  } -EXPORT_SYMBOL(arch_read_lock_slow); +EXPORT_SYMBOL(arch_read_unlock); -void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val) +/* + * We don't need an interrupt critical section here (unlike for + * arch_read_lock) since we should never use a bare write lock where + * it could be interrupted by code that could try to re-acquire it. + */ +void arch_write_lock(arch_rwlock_t *rwlock)  {  	/*  	 * The trailing underscore on this variable (and curr_ below) @@ -167,23 +168,36 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)  	 * when we compare them.  	 */  	u32 my_ticket_; +	u32 iterations = 0; +	u32 val = __insn_tns((int *)&rwlock->lock); -	/* Take out the next ticket; this will also stop would-be readers. */ -	if (val & 1) -		val = get_rwlock(rwlock); -	rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT); - -	/* Extract my ticket value from the original word. */ -	my_ticket_ = val >> WR_NEXT_SHIFT; +	if (likely(val == 0)) { +		rwlock->lock = 1 << _WR_NEXT_SHIFT; +		return; +	}  	/* -	 * Wait until the "current" field matches our ticket, and -	 * there are no remaining readers. +	 * Wait until there are no readers, then bump up the next +	 * field and capture the ticket value.  	 */  	for (;;) { +		if (!(val & 1)) { +			if ((val >> RD_COUNT_SHIFT) == 0) +				break; +			rwlock->lock = val; +		} +		delay_backoff(iterations++); +		val = __insn_tns((int *)&rwlock->lock); +	} + +	/* Take out the next ticket and extract my ticket value. */ +	rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT); +	my_ticket_ = val >> WR_NEXT_SHIFT; + +	/* Wait until the "current" field matches our ticket. */ +	for (;;) {  		u32 curr_ = val >> WR_CURR_SHIFT; -		u32 readers = val >> RD_COUNT_SHIFT; -		u32 delta = ((my_ticket_ - curr_) & WR_MASK) + !!readers; +		u32 delta = ((my_ticket_ - curr_) & WR_MASK);  		if (likely(delta == 0))  			break; @@ -199,23 +213,47 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)  			relax(4);  	}  } -EXPORT_SYMBOL(arch_write_lock_slow); +EXPORT_SYMBOL(arch_write_lock); -int __tns_atomic_acquire(atomic_t *lock) +int arch_write_trylock(arch_rwlock_t *rwlock)  { -	int ret; -	u32 iterations = 0; +	u32 val = __insn_tns((int *)&rwlock->lock); -	BUG_ON(__insn_mfspr(SPR_INTERRUPT_CRITICAL_SECTION)); -	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); +	/* +	 * If a tns is in progress, or there's a waiting or active locker, +	 * or active readers, we can't take the lock, so give up. 
+	 */ +	if (unlikely(val != 0)) { +		if (!(val & 1)) +			rwlock->lock = val; +		return 0; +	} -	while ((ret = __insn_tns((void *)&lock->counter)) == 1) -		delay_backoff(iterations++); -	return ret; +	/* Set the "next" field to mark it locked. */ +	rwlock->lock = 1 << _WR_NEXT_SHIFT; +	return 1;  } +EXPORT_SYMBOL(arch_write_trylock); -void __tns_atomic_release(atomic_t *p, int v) +void arch_write_unlock(arch_rwlock_t *rwlock)  { -	p->counter = v; -	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); +	u32 val, eq, mask; + +	mb();  /* guarantee anything modified under the lock is visible */ +	val = __insn_tns((int *)&rwlock->lock); +	if (likely(val == (1 << _WR_NEXT_SHIFT))) { +		rwlock->lock = 0; +		return; +	} +	while (unlikely(val & 1)) { +		/* Limited backoff since we are the highest-priority task. */ +		relax(4); +		val = __insn_tns((int *)&rwlock->lock); +	} +	mask = 1 << WR_CURR_SHIFT; +	val = __insn_addb(val, mask); +	eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT)); +	val = __insn_mz(eq & mask, val); +	rwlock->lock = val;  } +EXPORT_SYMBOL(arch_write_unlock); diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c new file mode 100644 index 00000000000..d6fb9581e98 --- /dev/null +++ b/arch/tile/lib/spinlock_64.c @@ -0,0 +1,104 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/spinlock.h> +#include <linux/module.h> +#include <asm/processor.h> + +#include "spinlock_common.h" + +/* + * Read the spinlock value without allocating in our cache and without + * causing an invalidation to another cpu with a copy of the cacheline. + * This is important when we are spinning waiting for the lock. + */ +static inline u32 arch_spin_read_noalloc(void *lock) +{ +	return atomic_cmpxchg((atomic_t *)lock, -1, -1); +} + +/* + * Wait until the high bits (current) match my ticket. + * If we notice the overflow bit set on entry, we clear it. + */ +void arch_spin_lock_slow(arch_spinlock_t *lock, u32 my_ticket) +{ +	if (unlikely(my_ticket & __ARCH_SPIN_NEXT_OVERFLOW)) { +		__insn_fetchand4(&lock->lock, ~__ARCH_SPIN_NEXT_OVERFLOW); +		my_ticket &= ~__ARCH_SPIN_NEXT_OVERFLOW; +	} + +	for (;;) { +		u32 val = arch_spin_read_noalloc(lock); +		u32 delta = my_ticket - arch_spin_current(val); +		if (delta == 0) +			return; +		relax((128 / CYCLES_PER_RELAX_LOOP) * delta); +	} +} +EXPORT_SYMBOL(arch_spin_lock_slow); + +/* + * Check the lock to see if it is plausible, and try to get it with cmpxchg(). 
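
The new arch_read_trylock() above folds the old *_slow helpers into a single tns/store sequence guarded by SPR_INTERRUPT_CRITICAL_SECTION, so the same read lock stays acquirable from interrupt context. A rough user-space sketch of the word handling, with the tns instruction modelled as an atomic exchange with 1, the SPR writes omitted, and the field layout (busy bit, writer fields, reader count in the top bits) chosen purely for illustration:

#include <stdatomic.h>
#include <stdint.h>

#define RD_SHIFT 24			/* illustrative, not the kernel's */
#define RD_WIDTH (32 - RD_SHIFT)

typedef struct { atomic_uint lock; } rwlock_sketch_t;

int read_trylock_sketch(rwlock_sketch_t *rw)
{
	/* "tns": grab the current word and leave 1 (busy) behind.
	 * The real code brackets this with interrupt-critical-section SPRs.
	 */
	uint32_t val = atomic_exchange(&rw->lock, 1);

	/* Shifting out the reader bits leaves the busy flag and writer
	 * fields; if they are all zero we may take the read lock.
	 */
	if ((uint32_t)(val << RD_WIDTH) == 0) {
		atomic_store(&rw->lock, val + (1u << RD_SHIFT));
		return 1;
	}

	/* Put back whatever real value we displaced (unless another tns
	 * was already in flight and we only displaced the busy flag).
	 */
	if ((val & 1) == 0)
		atomic_store(&rw->lock, val);
	return 0;
}

arch_read_lock() is then just this trylock retried in a delay_backoff() loop, which is why the separate slow-path entry points disappear from the file.
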
+ */ +int arch_spin_trylock(arch_spinlock_t *lock) +{ +	u32 val = arch_spin_read_noalloc(lock); +	if (unlikely(arch_spin_current(val) != arch_spin_next(val))) +		return 0; +	return cmpxchg(&lock->lock, val, (val + 1) & ~__ARCH_SPIN_NEXT_OVERFLOW) +		== val; +} +EXPORT_SYMBOL(arch_spin_trylock); + +void arch_spin_unlock_wait(arch_spinlock_t *lock) +{ +	u32 iterations = 0; +	while (arch_spin_is_locked(lock)) +		delay_backoff(iterations++); +} +EXPORT_SYMBOL(arch_spin_unlock_wait); + +/* + * If the read lock fails due to a writer, we retry periodically + * until the value is positive and we write our incremented reader count. + */ +void __read_lock_failed(arch_rwlock_t *rw) +{ +	u32 val; +	int iterations = 0; +	do { +		delay_backoff(iterations++); +		val = __insn_fetchaddgez4(&rw->lock, 1); +	} while (unlikely(arch_write_val_locked(val))); +} +EXPORT_SYMBOL(__read_lock_failed); + +/* + * If we failed because there were readers, clear the "writer" bit + * so we don't block additional readers.  Otherwise, there was another + * writer anyway, so our "fetchor" made no difference.  Then wait, + * issuing periodic fetchor instructions, till we get the lock. + */ +void __write_lock_failed(arch_rwlock_t *rw, u32 val) +{ +	int iterations = 0; +	do { +		if (!arch_write_val_locked(val)) +			val = __insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT); +		delay_backoff(iterations++); +		val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT); +	} while (val != 0); +} +EXPORT_SYMBOL(__write_lock_failed); diff --git a/arch/tile/lib/spinlock_common.h b/arch/tile/lib/spinlock_common.h index c1010980913..6ac37509fac 100644 --- a/arch/tile/lib/spinlock_common.h +++ b/arch/tile/lib/spinlock_common.h @@ -60,5 +60,5 @@ static void delay_backoff(int iterations)  	loops += __insn_crc32_32(stack_pointer, get_cycles_low()) &  		(loops - 1); -	relax(1 << exponent); +	relax(loops);  } diff --git a/arch/tile/lib/strchr_32.c b/arch/tile/lib/strchr_32.c index c94e6f7ae7b..841fe696301 100644 --- a/arch/tile/lib/strchr_32.c +++ b/arch/tile/lib/strchr_32.c @@ -16,8 +16,6 @@  #include <linux/string.h>  #include <linux/module.h> -#undef strchr -  char *strchr(const char *s, int c)  {  	int z, g; diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c new file mode 100644 index 00000000000..fe6e31c06f8 --- /dev/null +++ b/arch/tile/lib/strchr_64.c @@ -0,0 +1,62 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +#include "string-endian.h" + +char *strchr(const char *s, int c) +{ +	int z, g; + +	/* Get an aligned pointer. */ +	const uintptr_t s_int = (uintptr_t) s; +	const uint64_t *p = (const uint64_t *)(s_int & -8); + +	/* Create eight copies of the byte for which we are looking. */ +	const uint64_t goal = copy_byte(c); + +	/* Read the first aligned word, but force bytes before the string to +	 * match neither zero nor goal (we make sure the high bit of each +	 * byte is 1, and the low 7 bits are all the opposite of the goal +	 * byte). 
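
spinlock_64.c above builds arch_spinlock_t as a ticket lock whose word packs a "current" and a "next" field, read while spinning via the non-allocating cmpxchg(lock, -1, -1) trick. A small sketch of the trylock/unlock fast paths under an assumed 16/16 bit split, ignoring the __ARCH_SPIN_NEXT_OVERFLOW handling:

#include <stdatomic.h>

typedef struct { atomic_uint lock; } ticket_lock_t;

static unsigned int tl_current(unsigned int v) { return v >> 16; }
static unsigned int tl_next(unsigned int v)    { return v & 0xffffu; }

int ticket_trylock(ticket_lock_t *l)
{
	unsigned int val = atomic_load(&l->lock);

	if (tl_current(val) != tl_next(val))
		return 0;			/* another ticket is outstanding */

	/* Claim the next ticket; the CAS fails if someone raced us. */
	return atomic_compare_exchange_strong(&l->lock, &val, val + 1);
}

void ticket_unlock(ticket_lock_t *l)
{
	/* Pass ownership to the next ticket holder by bumping "current". */
	atomic_fetch_add(&l->lock, 1u << 16);
}

arch_spin_lock_slow() then waits roughly in proportion to how many tickets are still ahead of it (relax((128 / CYCLES_PER_RELAX_LOOP) * delta)) rather than backing off blindly.
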
+	 */ +	const uint64_t before_mask = MASK(s_int); +	uint64_t v = (*p | before_mask) ^ (goal & __insn_v1shrui(before_mask, 1)); + +	uint64_t zero_matches, goal_matches; +	while (1) { +		/* Look for a terminating '\0'. */ +		zero_matches = __insn_v1cmpeqi(v, 0); + +		/* Look for the goal byte. */ +		goal_matches = __insn_v1cmpeq(v, goal); + +		if (__builtin_expect((zero_matches | goal_matches) != 0, 0)) +			break; + +		v = *++p; +	} + +	z = CFZ(zero_matches); +	g = CFZ(goal_matches); + +	/* If we found c before '\0' we got a match. Note that if c == '\0' +	 * then g == z, and we correctly return the address of the '\0' +	 * rather than NULL. +	 */ +	return (g <= z) ? ((char *)p) + (g >> 3) : NULL; +} +EXPORT_SYMBOL(strchr); diff --git a/arch/tile/lib/string-endian.h b/arch/tile/lib/string-endian.h new file mode 100644 index 00000000000..2e49cbfe937 --- /dev/null +++ b/arch/tile/lib/string-endian.h @@ -0,0 +1,44 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + * + * Provide a mask based on the pointer alignment that + * sets up non-zero bytes before the beginning of the string. + * The MASK expression works because shift counts are taken mod 64. + * Also, specify how to count "first" and "last" bits + * when the bits have been read as a word. + */ + +#include <asm/byteorder.h> + +#ifdef __LITTLE_ENDIAN +#define MASK(x) (__insn_shl(1ULL, (x << 3)) - 1) +#define NULMASK(x) ((2ULL << x) - 1) +#define CFZ(x) __insn_ctz(x) +#define REVCZ(x) __insn_clz(x) +#else +#define MASK(x) (__insn_shl(-2LL, ((-x << 3) - 1))) +#define NULMASK(x) (-2LL << (63 - x)) +#define CFZ(x) __insn_clz(x) +#define REVCZ(x) __insn_ctz(x) +#endif + +/* + * Create eight copies of the byte in a uint64_t.  Byte Shuffle uses + * the bytes of srcB as the index into the dest vector to select a + * byte.  With all indices of zero, the first byte is copied into all + * the other bytes. + */ +static inline uint64_t copy_byte(uint8_t byte) +{ +	return __insn_shufflebytes(byte, 0, 0); +} diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c index 4974292a553..f26f88e11e4 100644 --- a/arch/tile/lib/strlen_32.c +++ b/arch/tile/lib/strlen_32.c @@ -16,8 +16,6 @@  #include <linux/string.h>  #include <linux/module.h> -#undef strlen -  size_t strlen(const char *s)  {  	/* Get an aligned pointer. */ diff --git a/arch/tile/lib/strlen_64.c b/arch/tile/lib/strlen_64.c new file mode 100644 index 00000000000..9583fc3361f --- /dev/null +++ b/arch/tile/lib/strlen_64.c @@ -0,0 +1,35 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  
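
strchr_64.c above (and strlen_64.c/strnlen_64.c just below) scan the string one aligned 64-bit word at a time, using v1cmpeqi to find NUL bytes and the MASK()/CFZ() helpers from string-endian.h to handle the unaligned head and locate the first match. A portable sketch of the same idea, with the classic has-zero-byte bit trick and __builtin_ctzll standing in for the Tile instructions and a little-endian layout assumed:

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Nonzero iff some byte of v is zero. */
static inline uint64_t has_zero_byte(uint64_t v)
{
	return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
}

size_t strlen_sketch(const char *s)
{
	const uintptr_t s_int = (uintptr_t)s;
	const uint64_t *p = (const uint64_t *)(s_int & -8);

	/* Read the aligned word containing the start of the string, with
	 * the bytes before the string forced nonzero (MASK()'s job).
	 */
	uint64_t v = *p | ((1ULL << ((s_int & 7) * 8)) - 1);

	uint64_t bits;
	while ((bits = has_zero_byte(v)) == 0)
		v = *++p;

	/* The lowest set bit marks the first NUL byte (little-endian). */
	return (const char *)p + (__builtin_ctzll(bits) >> 3) - s;
}

int main(void)
{
	printf("%zu\n", strlen_sketch("hello, tile"));	/* prints 11 */
	return 0;
}

strchr_64.c runs the same loop but additionally compares against a copy_byte()-replicated goal word and returns whichever match comes first, which is also why c == '\0' correctly yields the address of the terminator.
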
See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +#include "string-endian.h" + +size_t strlen(const char *s) +{ +	/* Get an aligned pointer. */ +	const uintptr_t s_int = (uintptr_t) s; +	const uint64_t *p = (const uint64_t *)(s_int & -8); + +	/* Read and MASK the first word. */ +	uint64_t v = *p | MASK(s_int); + +	uint64_t bits; +	while ((bits = __insn_v1cmpeqi(v, 0)) == 0) +		v = *++p; + +	return ((const char *)p) + (CFZ(bits) >> 3) - s; +} +EXPORT_SYMBOL(strlen); diff --git a/arch/tile/lib/strnlen_32.c b/arch/tile/lib/strnlen_32.c new file mode 100644 index 00000000000..1434141d9e0 --- /dev/null +++ b/arch/tile/lib/strnlen_32.c @@ -0,0 +1,47 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> + +size_t strnlen(const char *s, size_t count) +{ +	/* Get an aligned pointer. */ +	const uintptr_t s_int = (uintptr_t) s; +	const uint32_t *p = (const uint32_t *)(s_int & -4); +	size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1)); +	size_t len; +	uint32_t v, bits; + +	/* Avoid page fault risk by not reading any bytes when count is 0. */ +	if (count == 0) +		return 0; + +	/* Read first word, but force bytes before the string to be nonzero. */ +	v = *p | ((1 << ((s_int << 3) & 31)) - 1); + +	while ((bits = __insn_seqb(v, 0)) == 0) { +		if (bytes_read >= count) { +			/* Read COUNT bytes and didn't find the terminator. */ +			return count; +		} +		v = *++p; +		bytes_read += sizeof(v); +	} + +	len = ((const char *) p) + (__insn_ctz(bits) >> 3) - s; +	return (len < count ? len : count); +} +EXPORT_SYMBOL(strnlen); diff --git a/arch/tile/lib/strnlen_64.c b/arch/tile/lib/strnlen_64.c new file mode 100644 index 00000000000..2e8de6a5136 --- /dev/null +++ b/arch/tile/lib/strnlen_64.c @@ -0,0 +1,48 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +#include "string-endian.h" + +size_t strnlen(const char *s, size_t count) +{ +	/* Get an aligned pointer. */ +	const uintptr_t s_int = (uintptr_t) s; +	const uint64_t *p = (const uint64_t *)(s_int & -8); +	size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1)); +	size_t len; +	uint64_t v, bits; + +	/* Avoid page fault risk by not reading any bytes when count is 0. */ +	if (count == 0) +		return 0; + +	/* Read and MASK the first word. 
*/ +	v = *p | MASK(s_int); + +	while ((bits = __insn_v1cmpeqi(v, 0)) == 0) { +		if (bytes_read >= count) { +			/* Read COUNT bytes and didn't find the terminator. */ +			return count; +		} +		v = *++p; +		bytes_read += sizeof(v); +	} + +	len = ((const char *) p) + (CFZ(bits) >> 3) - s; +	return (len < count ? len : count); +} +EXPORT_SYMBOL(strnlen); diff --git a/arch/tile/lib/uaccess.c b/arch/tile/lib/uaccess.c index f8d398c9ee7..030abe3ee4f 100644 --- a/arch/tile/lib/uaccess.c +++ b/arch/tile/lib/uaccess.c @@ -22,11 +22,3 @@ int __range_ok(unsigned long addr, unsigned long size)  		 is_arch_mappable_range(addr, size));  }  EXPORT_SYMBOL(__range_ok); - -#ifdef CONFIG_DEBUG_COPY_FROM_USER -void copy_from_user_overflow(void) -{ -       WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); -#endif diff --git a/arch/tile/lib/usercopy_32.S b/arch/tile/lib/usercopy_32.S index 979f76d8374..1bc16222463 100644 --- a/arch/tile/lib/usercopy_32.S +++ b/arch/tile/lib/usercopy_32.S @@ -19,82 +19,6 @@  /* Access user memory, but use MMU to avoid propagating kernel exceptions. */ -	.pushsection .fixup,"ax" - -get_user_fault: -	{ move r0, zero; move r1, zero } -	{ movei r2, -EFAULT; jrp lr } -	ENDPROC(get_user_fault) - -put_user_fault: -	{ movei r0, -EFAULT; jrp lr } -	ENDPROC(put_user_fault) - -	.popsection - -/* - * __get_user_N functions take a pointer in r0, and return 0 in r2 - * on success, with the value in r0; or else -EFAULT in r2. - */ -#define __get_user_N(bytes, LOAD) \ -	STD_ENTRY(__get_user_##bytes); \ -1:	{ LOAD r0, r0; move r1, zero; move r2, zero }; \ -	jrp lr; \ -	STD_ENDPROC(__get_user_##bytes); \ -	.pushsection __ex_table,"a"; \ -	.word 1b, get_user_fault; \ -	.popsection - -__get_user_N(1, lb_u) -__get_user_N(2, lh_u) -__get_user_N(4, lw) - -/* - * __get_user_8 takes a pointer in r0, and returns 0 in r2 - * on success, with the value in r0/r1; or else -EFAULT in r2. - */ -	STD_ENTRY(__get_user_8); -1:	{ lw r0, r0; addi r1, r0, 4 }; -2:	{ lw r1, r1; move r2, zero }; -	jrp lr; -	STD_ENDPROC(__get_user_8); -	.pushsection __ex_table,"a"; -	.word 1b, get_user_fault; -	.word 2b, get_user_fault; -	.popsection - -/* - * __put_user_N functions take a value in r0 and a pointer in r1, - * and return 0 in r0 on success or -EFAULT on failure. - */ -#define __put_user_N(bytes, STORE) \ -	STD_ENTRY(__put_user_##bytes); \ -1:	{ STORE r1, r0; move r0, zero }; \ -	jrp lr; \ -	STD_ENDPROC(__put_user_##bytes); \ -	.pushsection __ex_table,"a"; \ -	.word 1b, put_user_fault; \ -	.popsection - -__put_user_N(1, sb) -__put_user_N(2, sh) -__put_user_N(4, sw) - -/* - * __put_user_8 takes a value in r0/r1 and a pointer in r2, - * and returns 0 in r0 on success or -EFAULT on failure. - */ -STD_ENTRY(__put_user_8) -1:      { sw r2, r0; addi r2, r2, 4 } -2:      { sw r2, r1; move r0, zero } -	jrp lr -	STD_ENDPROC(__put_user_8) -	.pushsection __ex_table,"a" -	.word 1b, put_user_fault -	.word 2b, put_user_fault -	.popsection - -  /*   * strnlen_user_asm takes the pointer in r0, and the length bound in r1.   * It returns the length, including the terminating NUL, or zero on exception. 
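
strnlen_32.c and strnlen_64.c above add one twist to that scan: they track how many in-bounds bytes each aligned read has covered and give up with count once the budget is spent, and they never read memory at all when count is 0. A sketch of that bookkeeping on top of the portable zero-byte test (same little-endian and bit-trick assumptions as the strlen sketch above):

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

static inline uint64_t has_zero_byte(uint64_t v)
{
	return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
}

size_t strnlen_sketch(const char *s, size_t count)
{
	const uintptr_t s_int = (uintptr_t)s;
	const uint64_t *p = (const uint64_t *)(s_int & -8);
	size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
	uint64_t v, bits;
	size_t len;

	/* Never touch memory at all when the bound is zero. */
	if (count == 0)
		return 0;

	/* First aligned word, with the bytes before the string forced
	 * nonzero (the job MASK(s_int) does in the kernel version).
	 */
	v = *p | ((1ULL << ((s_int & 7) * 8)) - 1);

	while ((bits = has_zero_byte(v)) == 0) {
		if (bytes_read >= count)
			return count;	/* budget spent, no NUL seen */
		v = *++p;
		bytes_read += sizeof(v);
	}

	len = (const char *)p + (__builtin_ctzll(bits) >> 3) - s;
	return len < count ? len : count;
}

int main(void)
{
	printf("%zu %zu\n", strnlen_sketch("tile", 2),
	       strnlen_sketch("tile", 16));	/* prints "2 4" */
	return 0;
}
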
@@ -112,6 +36,7 @@ strnlen_user_fault:  	{ move r0, zero; jrp lr }  	ENDPROC(strnlen_user_fault)  	.section __ex_table,"a" +	.align 4  	.word 1b, strnlen_user_fault  	.popsection @@ -123,18 +48,20 @@ strnlen_user_fault:   */  STD_ENTRY(strncpy_from_user_asm)  	{ bz r2, 2f; move r3, r0 } -1:      { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 } +1:	{ lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }  	{ sb r0, r4; addi r0, r0, 1 } -	bz r2, 2f -	bnzt r4, 1b -	addi r0, r0, -1   /* don't count the trailing NUL */ -2:      { sub r0, r0, r3; jrp lr } +	bz r4, 2f +	bnzt r2, 1b +	{ sub r0, r0, r3; jrp lr } +2:	addi r0, r0, -1   /* don't count the trailing NUL */ +	{ sub r0, r0, r3; jrp lr }  	STD_ENDPROC(strncpy_from_user_asm)  	.pushsection .fixup,"ax"  strncpy_from_user_fault:  	{ movei r0, -EFAULT; jrp lr }  	ENDPROC(strncpy_from_user_fault)  	.section __ex_table,"a" +	.align 4  	.word 1b, strncpy_from_user_fault  	.popsection @@ -153,6 +80,7 @@ STD_ENTRY(clear_user_asm)  	bnzt r1, 1b  2:      { move r0, r1; jrp lr }  	.pushsection __ex_table,"a" +	.align 4  	.word 1b, 2b  	.popsection @@ -162,6 +90,7 @@ STD_ENTRY(clear_user_asm)  2:      { move r0, r1; jrp lr }  	STD_ENDPROC(clear_user_asm)  	.pushsection __ex_table,"a" +	.align 4  	.word 1b, 2b  	.popsection @@ -181,25 +110,7 @@ STD_ENTRY(flush_user_asm)  2:      { move r0, r1; jrp lr }  	STD_ENDPROC(flush_user_asm)  	.pushsection __ex_table,"a" -	.word 1b, 2b -	.popsection - -/* - * inv_user_asm takes the user target address in r0 and the - * number of bytes to invalidate in r1. - * It returns the number of not inv'able bytes (hopefully zero) in r0. - */ -STD_ENTRY(inv_user_asm) -	bz r1, 2f -	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 } -	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } -	{ and r0, r0, r2; and r1, r1, r2 } -	{ sub r1, r1, r0 } -1:      { inv r0; addi r1, r1, -CHIP_INV_STRIDE() } -	{ addi r0, r0, CHIP_INV_STRIDE(); bnzt r1, 1b } -2:      { move r0, r1; jrp lr } -	STD_ENDPROC(inv_user_asm) -	.pushsection __ex_table,"a" +	.align 4  	.word 1b, 2b  	.popsection @@ -219,5 +130,6 @@ STD_ENTRY(finv_user_asm)  2:      { move r0, r1; jrp lr }  	STD_ENDPROC(finv_user_asm)  	.pushsection __ex_table,"a" +	.align 4  	.word 1b, 2b  	.popsection diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S new file mode 100644 index 00000000000..b3b31a3306f --- /dev/null +++ b/arch/tile/lib/usercopy_64.S @@ -0,0 +1,135 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/linkage.h> +#include <asm/errno.h> +#include <asm/cache.h> +#include <arch/chip.h> + +/* Access user memory, but use MMU to avoid propagating kernel exceptions. */ + +/* + * strnlen_user_asm takes the pointer in r0, and the length bound in r1. + * It returns the length, including the terminating NUL, or zero on exception. + * If length is greater than the bound, returns one plus the bound. 
+ */ +STD_ENTRY(strnlen_user_asm) +	{ beqz r1, 2f; addi r3, r0, -1 }  /* bias down to include NUL */ +1:      { ld1u r4, r0; addi r1, r1, -1 } +	beqz r4, 2f +	{ bnezt r1, 1b; addi r0, r0, 1 } +2:      { sub r0, r0, r3; jrp lr } +	STD_ENDPROC(strnlen_user_asm) +	.pushsection .fixup,"ax" +strnlen_user_fault: +	{ move r0, zero; jrp lr } +	ENDPROC(strnlen_user_fault) +	.section __ex_table,"a" +	.align 8 +	.quad 1b, strnlen_user_fault +	.popsection + +/* + * strncpy_from_user_asm takes the kernel target pointer in r0, + * the userspace source pointer in r1, and the length bound (including + * the trailing NUL) in r2.  On success, it returns the string length + * (not including the trailing NUL), or -EFAULT on failure. + */ +STD_ENTRY(strncpy_from_user_asm) +	{ beqz r2, 2f; move r3, r0 } +1:	{ ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 } +	{ st1 r0, r4; addi r0, r0, 1 } +	beqz r4, 2f +	bnezt r2, 1b +	{ sub r0, r0, r3; jrp lr } +2:	addi r0, r0, -1   /* don't count the trailing NUL */ +	{ sub r0, r0, r3; jrp lr } +	STD_ENDPROC(strncpy_from_user_asm) +	.pushsection .fixup,"ax" +strncpy_from_user_fault: +	{ movei r0, -EFAULT; jrp lr } +	ENDPROC(strncpy_from_user_fault) +	.section __ex_table,"a" +	.align 8 +	.quad 1b, strncpy_from_user_fault +	.popsection + +/* + * clear_user_asm takes the user target address in r0 and the + * number of bytes to zero in r1. + * It returns the number of uncopiable bytes (hopefully zero) in r0. + * Note that we don't use a separate .fixup section here since we fall + * through into the "fixup" code as the last straight-line bundle anyway. + */ +STD_ENTRY(clear_user_asm) +	{ beqz r1, 2f; or r2, r0, r1 } +	andi r2, r2, 7 +	beqzt r2, .Lclear_aligned_user_asm +1:      { st1 r0, zero; addi r0, r0, 1; addi r1, r1, -1 } +	bnezt r1, 1b +2:      { move r0, r1; jrp lr } +	.pushsection __ex_table,"a" +	.align 8 +	.quad 1b, 2b +	.popsection + +.Lclear_aligned_user_asm: +1:      { st r0, zero; addi r0, r0, 8; addi r1, r1, -8 } +	bnezt r1, 1b +2:      { move r0, r1; jrp lr } +	STD_ENDPROC(clear_user_asm) +	.pushsection __ex_table,"a" +	.align 8 +	.quad 1b, 2b +	.popsection + +/* + * flush_user_asm takes the user target address in r0 and the + * number of bytes to flush in r1. + * It returns the number of unflushable bytes (hopefully zero) in r0. + */ +STD_ENTRY(flush_user_asm) +	beqz r1, 2f +	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 } +	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } +	{ and r0, r0, r2; and r1, r1, r2 } +	{ sub r1, r1, r0 } +1:      { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() } +	{ addi r0, r0, CHIP_FLUSH_STRIDE(); bnezt r1, 1b } +2:      { move r0, r1; jrp lr } +	STD_ENDPROC(flush_user_asm) +	.pushsection __ex_table,"a" +	.align 8 +	.quad 1b, 2b +	.popsection + +/* + * finv_user_asm takes the user target address in r0 and the + * number of bytes to flush-invalidate in r1. + * It returns the number of not finv'able bytes (hopefully zero) in r0. + */ +STD_ENTRY(finv_user_asm) +	beqz r1, 2f +	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 } +	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } +	{ and r0, r0, r2; and r1, r1, r2 } +	{ sub r1, r1, r0 } +1:      { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() } +	{ addi r0, r0, CHIP_FINV_STRIDE(); bnezt r1, 1b } +2:      { move r0, r1; jrp lr } +	STD_ENDPROC(finv_user_asm) +	.pushsection __ex_table,"a" +	.align 8 +	.quad 1b, 2b +	.popsection  | 
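
flush_user_asm and finv_user_asm above both start by rounding the requested range out to whole L2 cache lines before walking it. A short sketch of that address arithmetic, with an assumed 64-byte line standing in for CHIP_L2_LINE_SIZE()/CHIP_FLUSH_STRIDE() and a printf in place of the flush/finv instructions:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define L2_LINE 64UL		/* assumed line size and stride */

void flush_range_sketch(uintptr_t addr, size_t len)
{
	if (len == 0)
		return;

	uintptr_t start = addr & -L2_LINE;			  /* round down */
	uintptr_t end   = (addr + len + L2_LINE - 1) & -L2_LINE;  /* round up  */

	for (uintptr_t line = start; line < end; line += L2_LINE)
		printf("flush line at %#lx\n", (unsigned long)line);
}

int main(void)
{
	flush_range_sketch(0x10007, 100);
	return 0;
}

With addr = 0x10007 and len = 100 the loop touches exactly the two lines at 0x10000 and 0x10040, mirroring how the assembly masks r0 and r1 with the negated line size before its flush/finv loop.
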
