Diffstat (limited to 'arch/tile/lib')
-rw-r--r--  arch/tile/lib/Makefile           |  20
-rw-r--r--  arch/tile/lib/atomic_32.c        | 187
-rw-r--r--  arch/tile/lib/atomic_asm_32.S    |   7
-rw-r--r--  arch/tile/lib/cacheflush.c       | 150
-rw-r--r--  arch/tile/lib/checksum.c         |  15
-rw-r--r--  arch/tile/lib/cpumask.c          |   2
-rw-r--r--  arch/tile/lib/delay.c            |  21
-rw-r--r--  arch/tile/lib/exports.c          |  32
-rw-r--r--  arch/tile/lib/mb_incoherent.S    |  34
-rw-r--r--  arch/tile/lib/memchr_32.c        |  35
-rw-r--r--  arch/tile/lib/memchr_64.c        |  69
-rw-r--r--  arch/tile/lib/memcpy_32.S        |  63
-rw-r--r--  arch/tile/lib/memcpy_64.c        | 367
-rw-r--r--  arch/tile/lib/memcpy_tile64.c    | 276
-rw-r--r--  arch/tile/lib/memcpy_user_64.c   |  94
-rw-r--r--  arch/tile/lib/memset_32.c        | 110
-rw-r--r--  arch/tile/lib/memset_64.c        | 142
-rw-r--r--  arch/tile/lib/spinlock_32.c      | 190
-rw-r--r--  arch/tile/lib/spinlock_64.c      | 104
-rw-r--r--  arch/tile/lib/spinlock_common.h  |   2
-rw-r--r--  arch/tile/lib/strchr_32.c        |   2
-rw-r--r--  arch/tile/lib/strchr_64.c        |  62
-rw-r--r--  arch/tile/lib/string-endian.h    |  44
-rw-r--r--  arch/tile/lib/strlen_32.c        |   2
-rw-r--r--  arch/tile/lib/strlen_64.c        |  35
-rw-r--r--  arch/tile/lib/strnlen_32.c       |  47
-rw-r--r--  arch/tile/lib/strnlen_64.c       |  48
-rw-r--r--  arch/tile/lib/uaccess.c          |   8
-rw-r--r--  arch/tile/lib/usercopy_32.S      | 112
-rw-r--r--  arch/tile/lib/usercopy_64.S      | 135
30 files changed, 1518 insertions(+), 897 deletions(-)
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 93122d5b155..c4211cbb202 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -2,17 +2,17 @@
# Makefile for TILE-specific library files..
#
-lib-y = cacheflush.o checksum.o cpumask.o delay.o \
- mb_incoherent.o uaccess.o memmove.o \
- memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
- strchr_$(BITS).o strlen_$(BITS).o
-
-ifeq ($(CONFIG_TILEGX),y)
-lib-y += memcpy_user_64.o
-else
-lib-y += atomic_32.o atomic_asm_32.o memcpy_tile64.o
-endif
+lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \
+ memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
+ strchr_$(BITS).o strlen_$(BITS).o strnlen_$(BITS).o
+lib-$(CONFIG_TILEGX) += memcpy_user_64.o
+lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o
lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o
obj-$(CONFIG_MODULES) += exports.o
+
+# The finv_buffer_remote() and copy_{to,from}_user() routines can't
+# have -pg added, since they both rely on being leaf functions.
+CFLAGS_REMOVE_cacheflush.o = -pg
+CFLAGS_REMOVE_memcpy_user_64.o = -pg
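The -pg restriction above exists because -pg makes the compiler emit a profiling call (__mcount on tile, exported later in this commit) at every function entry; a function that makes a call can no longer be a leaf. A minimal illustration of the effect, written out by hand (in reality the compiler inserts the call, these function names are made up for exposition):

/* Illustration only: roughly what -pg instrumentation does to a leaf. */
extern void __mcount(void);

int leaf(int x)
{
	return x + 1;		/* leaf: no calls, no stack frame needed */
}

int leaf_built_with_pg(int x)
{
	__mcount();		/* inserted by -pg: the function now makes a
				 * call, must save lr, and is no longer a leaf */
	return x + 1;
}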
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
index 7a5cc706ab6..c89b211fd9e 100644
--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -17,55 +17,15 @@
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <asm/atomic.h>
-#include <asm/futex.h>
+#include <linux/atomic.h>
#include <arch/chip.h>
-/* See <asm/atomic_32.h> */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-/*
- * A block of memory containing locks for atomic ops. Each instance of this
- * struct will be homed on a different CPU.
- */
-struct atomic_locks_on_cpu {
- int lock[ATOMIC_HASH_L2_SIZE];
-} __attribute__((aligned(ATOMIC_HASH_L2_SIZE * 4)));
-
-static DEFINE_PER_CPU(struct atomic_locks_on_cpu, atomic_lock_pool);
-
-/* The locks we'll use until __init_atomic_per_cpu is called. */
-static struct atomic_locks_on_cpu __initdata initial_atomic_locks;
-
-/* Hash into this vector to get a pointer to lock for the given atomic. */
-struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE]
- __write_once = {
- [0 ... ATOMIC_HASH_L1_SIZE-1] (&initial_atomic_locks)
-};
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
/* This page is remapped on startup to be hash-for-home. */
-int atomic_locks[PAGE_SIZE / sizeof(int) /* Only ATOMIC_HASH_SIZE is used */]
- __attribute__((aligned(PAGE_SIZE), section(".bss.page_aligned")));
-
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
+int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;
-static inline int *__atomic_hashed_lock(volatile void *v)
+int *__atomic_hashed_lock(volatile void *v)
{
- /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec.S */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
- unsigned long i =
- (unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));
- unsigned long n = __insn_crc32_32(0, i);
-
- /* Grab high bits for L1 index. */
- unsigned long l1_index = n >> ((sizeof(n) * 8) - ATOMIC_HASH_L1_SHIFT);
- /* Grab low bits for L2 index. */
- unsigned long l2_index = n & (ATOMIC_HASH_L2_SIZE - 1);
-
- return &atomic_lock_ptr[l1_index]->lock[l2_index];
-#else
+ /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
/*
* Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index.
* Using mm works here because atomic_locks is page aligned.
@@ -74,26 +34,13 @@ static inline int *__atomic_hashed_lock(volatile void *v)
(unsigned long)atomic_locks,
2, (ATOMIC_HASH_SHIFT + 2) - 1);
return (int *)ptr;
-#endif
}
#ifdef CONFIG_SMP
/* Return whether the passed pointer is a valid atomic lock pointer. */
static int is_atomic_lock(int *p)
{
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
- int i;
- for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
-
- if (p >= &atomic_lock_ptr[i]->lock[0] &&
- p < &atomic_lock_ptr[i]->lock[ATOMIC_HASH_L2_SIZE]) {
- return 1;
- }
- }
- return 0;
-#else
return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE];
-#endif
}
void __atomic_fault_unlock(int *irqlock_word)
@@ -112,33 +59,32 @@ static inline int *__atomic_setup(volatile void *v)
return __atomic_hashed_lock(v);
}
-int _atomic_xchg(atomic_t *v, int n)
+int _atomic_xchg(int *v, int n)
{
- return __atomic_xchg(&v->counter, __atomic_setup(v), n).val;
+ return __atomic_xchg(v, __atomic_setup(v), n).val;
}
EXPORT_SYMBOL(_atomic_xchg);
-int _atomic_xchg_add(atomic_t *v, int i)
+int _atomic_xchg_add(int *v, int i)
{
- return __atomic_xchg_add(&v->counter, __atomic_setup(v), i).val;
+ return __atomic_xchg_add(v, __atomic_setup(v), i).val;
}
EXPORT_SYMBOL(_atomic_xchg_add);
-int _atomic_xchg_add_unless(atomic_t *v, int a, int u)
+int _atomic_xchg_add_unless(int *v, int a, int u)
{
/*
* Note: argument order is switched here since it is easier
* to use the first argument consistently as the "old value"
* in the assembly, as is done for _atomic_cmpxchg().
*/
- return __atomic_xchg_add_unless(&v->counter, __atomic_setup(v), u, a)
- .val;
+ return __atomic_xchg_add_unless(v, __atomic_setup(v), u, a).val;
}
EXPORT_SYMBOL(_atomic_xchg_add_unless);
-int _atomic_cmpxchg(atomic_t *v, int o, int n)
+int _atomic_cmpxchg(int *v, int o, int n)
{
- return __atomic_cmpxchg(&v->counter, __atomic_setup(v), o, n).val;
+ return __atomic_cmpxchg(v, __atomic_setup(v), o, n).val;
}
EXPORT_SYMBOL(_atomic_cmpxchg);
@@ -161,78 +107,36 @@ unsigned long _atomic_xor(volatile unsigned long *p, unsigned long mask)
EXPORT_SYMBOL(_atomic_xor);
-u64 _atomic64_xchg(atomic64_t *v, u64 n)
+long long _atomic64_xchg(long long *v, long long n)
{
- return __atomic64_xchg(&v->counter, __atomic_setup(v), n);
+ return __atomic64_xchg(v, __atomic_setup(v), n);
}
EXPORT_SYMBOL(_atomic64_xchg);
-u64 _atomic64_xchg_add(atomic64_t *v, u64 i)
+long long _atomic64_xchg_add(long long *v, long long i)
{
- return __atomic64_xchg_add(&v->counter, __atomic_setup(v), i);
+ return __atomic64_xchg_add(v, __atomic_setup(v), i);
}
EXPORT_SYMBOL(_atomic64_xchg_add);
-u64 _atomic64_xchg_add_unless(atomic64_t *v, u64 a, u64 u)
+long long _atomic64_xchg_add_unless(long long *v, long long a, long long u)
{
/*
* Note: argument order is switched here since it is easier
* to use the first argument consistently as the "old value"
* in the assembly, as is done for _atomic_cmpxchg().
*/
- return __atomic64_xchg_add_unless(&v->counter, __atomic_setup(v),
- u, a);
+ return __atomic64_xchg_add_unless(v, __atomic_setup(v), u, a);
}
EXPORT_SYMBOL(_atomic64_xchg_add_unless);
-u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
+long long _atomic64_cmpxchg(long long *v, long long o, long long n)
{
- return __atomic64_cmpxchg(&v->counter, __atomic_setup(v), o, n);
+ return __atomic64_cmpxchg(v, __atomic_setup(v), o, n);
}
EXPORT_SYMBOL(_atomic64_cmpxchg);
-static inline int *__futex_setup(int __user *v)
-{
- /*
- * Issue a prefetch to the counter to bring it into cache.
- * As for __atomic_setup, but we can't do a read into the L1
- * since it might fault; instead we do a prefetch into the L2.
- */
- __insn_prefetch(v);
- return __atomic_hashed_lock((int __force *)v);
-}
-
-struct __get_user futex_set(int __user *v, int i)
-{
- return __atomic_xchg((int __force *)v, __futex_setup(v), i);
-}
-
-struct __get_user futex_add(int __user *v, int n)
-{
- return __atomic_xchg_add((int __force *)v, __futex_setup(v), n);
-}
-
-struct __get_user futex_or(int __user *v, int n)
-{
- return __atomic_or((int __force *)v, __futex_setup(v), n);
-}
-
-struct __get_user futex_andn(int __user *v, int n)
-{
- return __atomic_andn((int __force *)v, __futex_setup(v), n);
-}
-
-struct __get_user futex_xor(int __user *v, int n)
-{
- return __atomic_xor((int __force *)v, __futex_setup(v), n);
-}
-
-struct __get_user futex_cmpxchg(int __user *v, int o, int n)
-{
- return __atomic_cmpxchg((int __force *)v, __futex_setup(v), o, n);
-}
-
/*
* If any of the atomic or futex routines hit a bad address (not in
* the page tables at kernel PL) this routine is called. The futex
@@ -251,54 +155,8 @@ struct __get_user __atomic_bad_address(int __user *addr)
}
-#if CHIP_HAS_CBOX_HOME_MAP()
-static int __init noatomichash(char *str)
-{
- pr_warning("noatomichash is deprecated.\n");
- return 1;
-}
-__setup("noatomichash", noatomichash);
-#endif
-
void __init __init_atomic_per_cpu(void)
{
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
- unsigned int i;
- int actual_cpu;
-
- /*
- * Before this is called from setup, we just have one lock for
- * all atomic objects/operations. Here we replace the
- * elements of atomic_lock_ptr so that they point at per_cpu
- * integers. This seemingly over-complex approach stems from
- * the fact that DEFINE_PER_CPU defines an entry for each cpu
- * in the grid, not each cpu from 0..ATOMIC_HASH_SIZE-1. But
- * for efficient hashing of atomics to their locks we want a
- * compile time constant power of 2 for the size of this
- * table, so we use ATOMIC_HASH_SIZE.
- *
- * Here we populate atomic_lock_ptr from the per cpu
- * atomic_lock_pool, interspersing by actual cpu so that
- * subsequent elements are homed on consecutive cpus.
- */
-
- actual_cpu = cpumask_first(cpu_possible_mask);
-
- for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
- /*
- * Preincrement to slightly bias against using cpu 0,
- * which has plenty of stuff homed on it already.
- */
- actual_cpu = cpumask_next(actual_cpu, cpu_possible_mask);
- if (actual_cpu >= nr_cpu_ids)
- actual_cpu = cpumask_first(cpu_possible_mask);
-
- atomic_lock_ptr[i] = &per_cpu(atomic_lock_pool, actual_cpu);
- }
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
/* Validate power-of-two and "bigger than cpus" assumption */
BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);
@@ -322,9 +180,4 @@ void __init __init_atomic_per_cpu(void)
* That should not produce more indices than ATOMIC_HASH_SIZE.
*/
BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
-
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
- /* The futex code makes this assumption, so we validate it here. */
- BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
}
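For context, a minimal generic-C sketch of the hashed-lock emulation that atomic_32.c sets up, assuming a power-of-two lock table and a simple test-and-set lock; the kernel's real fast path is the assembly in atomic_asm_32.S, entered with the lock pointer computed by __atomic_hashed_lock() above, and the table size below is only an assumption for illustration:

/* Sketch of hashed-lock atomics; not the kernel's actual implementation. */
#define ATOMIC_HASH_SIZE 512			/* assumed, for illustration */

static int locks[ATOMIC_HASH_SIZE];

static int *hashed_lock(volatile void *v)
{
	/* Use low address bits above the doubleword offset as the index. */
	unsigned long idx = ((unsigned long)v >> 3) & (ATOMIC_HASH_SIZE - 1);
	return &locks[idx];
}

static int emulated_cmpxchg(int *p, int old, int new)
{
	int *lock = hashed_lock(p);
	int val;

	while (__sync_lock_test_and_set(lock, 1))	/* acquire */
		;
	val = *p;
	if (val == old)
		*p = new;
	__sync_lock_release(lock);			/* release */
	return val;
}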
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 5a5514b77e7..6bda3132cd6 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -14,7 +14,7 @@
* Support routines for atomic operations. Each function takes:
*
* r0: address to manipulate
- * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG)
+ * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG)
* r2: new value to write, or for cmpxchg/add_unless, value to compare against
* r3: (cmpxchg/xchg_add_unless) new value to write or add;
* (atomic64 ops) high word of value to write
@@ -59,7 +59,7 @@
* bad kernel addresses).
*
* Note that if the value we would store is the same as what we
- * loaded, we bypass the load. Other platforms with true atomics can
+ * loaded, we bypass the store. Other platforms with true atomics can
* make the guarantee that a non-atomic __clear_bit(), for example,
* can safely race with an atomic test_and_set_bit(); this example is
* from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do
@@ -70,7 +70,7 @@
*/
#include <linux/linkage.h>
-#include <asm/atomic.h>
+#include <asm/atomic_32.h>
#include <asm/page.h>
#include <asm/processor.h>
@@ -164,6 +164,7 @@ STD_ENTRY_SECTION(__atomic\name, .text.atomic)
STD_ENDPROC(__atomic\name)
.ifc \bitwidth,32
.pushsection __ex_table,"a"
+ .align 4
.word 1b, __atomic\name
.word 2b, __atomic\name
.word __atomic\name, __atomic_bad_address
diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c
index 11b6164c209..9c0ec22009a 100644
--- a/arch/tile/lib/cacheflush.c
+++ b/arch/tile/lib/cacheflush.c
@@ -12,12 +12,162 @@
* more details.
*/
+#include <linux/export.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <arch/icache.h>
+#include <arch/spr_def.h>
void __flush_icache_range(unsigned long start, unsigned long end)
{
invalidate_icache((const void *)start, end - start, PAGE_SIZE);
}
+
+
+/* Force a load instruction to issue. */
+static inline void force_load(char *p)
+{
+ *(volatile char *)p;
+}
+
+/*
+ * Flush and invalidate a VA range that is homed remotely on a single
+ * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting
+ * until the memory controller holds the flushed values.
+ */
+void __attribute__((optimize("omit-frame-pointer")))
+finv_buffer_remote(void *buffer, size_t size, int hfh)
+{
+ char *p, *base;
+ size_t step_size, load_count;
+
+ /*
+ * On TILEPro the striping granularity is a fixed 8KB; on
+ * TILE-Gx it is configurable, and we rely on the fact that
+ * the hypervisor always configures maximum striping, so that
+ * bits 9 and 10 of the PA are part of the stripe function, so
+ * every 512 bytes we hit a striping boundary.
+ *
+ */
+#ifdef __tilegx__
+ const unsigned long STRIPE_WIDTH = 512;
+#else
+ const unsigned long STRIPE_WIDTH = 8192;
+#endif
+
+#ifdef __tilegx__
+ /*
+ * On TILE-Gx, we must disable the dstream prefetcher before doing
+ * a cache flush; otherwise, we could end up with data in the cache
+ * that we don't want there. Note that normally we'd do an mf
+ * after the SPR write that disables the prefetcher, but we do one
+ * below, before any further loads, so there's no need to do it
+ * here.
+ */
+ uint_reg_t old_dstream_pf = __insn_mfspr(SPR_DSTREAM_PF);
+ __insn_mtspr(SPR_DSTREAM_PF, 0);
+#endif
+
+ /*
+ * Flush and invalidate the buffer out of the local L1/L2
+ * and request the home cache to flush and invalidate as well.
+ */
+ __finv_buffer(buffer, size);
+
+ /*
+ * Wait for the home cache to acknowledge that it has processed
+ * all the flush-and-invalidate requests. This does not mean
+ * that the flushed data has reached the memory controller yet,
+ * but it does mean the home cache is processing the flushes.
+ */
+ __insn_mf();
+
+ /*
+ * Issue a load to the last cache line, which can't complete
+ * until all the previously-issued flushes to the same memory
+ * controller have also completed. If we weren't striping
+ * memory, that one load would be sufficient, but since we may
+ * be, we also need to back up to the last load issued to
+ * another memory controller, which would be the point where
+ * we crossed a "striping" boundary (the granularity of striping
+ * across memory controllers). Keep backing up and doing this
+ * until we are before the beginning of the buffer, or have
+ * hit all the controllers.
+ *
+ * If we are flushing a hash-for-home buffer, it's even worse.
+ * Each line may be homed on a different tile, and each tile
+ * may have up to four lines that are on different
+ * controllers. So as we walk backwards, we have to touch
+ * enough cache lines to satisfy these constraints. In
+ * practice this ends up being close enough to "load from
+ * every cache line on a full memory stripe on each
+ * controller" that we simply do that, to simplify the logic.
+ *
+ * On TILE-Gx the hash-for-home function is much more complex,
+ * with the upshot being we can't readily guarantee we have
+ * hit both entries in the 128-entry AMT that were hit by any
+ * load in the entire range, so we just re-load them all.
+ * With larger buffers, we may want to consider using a hypervisor
+ * trap to issue loads directly to each hash-for-home tile for
+ * each controller (doing it from Linux would trash the TLB).
+ */
+ if (hfh) {
+ step_size = L2_CACHE_BYTES;
+#ifdef __tilegx__
+ load_count = (size + L2_CACHE_BYTES - 1) / L2_CACHE_BYTES;
+#else
+ load_count = (STRIPE_WIDTH / L2_CACHE_BYTES) *
+ (1 << CHIP_LOG_NUM_MSHIMS());
+#endif
+ } else {
+ step_size = STRIPE_WIDTH;
+ load_count = (1 << CHIP_LOG_NUM_MSHIMS());
+ }
+
+ /* Load the last byte of the buffer. */
+ p = (char *)buffer + size - 1;
+ force_load(p);
+
+ /* Bump down to the end of the previous stripe or cache line. */
+ p -= step_size;
+ p = (char *)((unsigned long)p | (step_size - 1));
+
+ /* Figure out how far back we need to go. */
+ base = p - (step_size * (load_count - 2));
+ if ((unsigned long)base < (unsigned long)buffer)
+ base = buffer;
+
+ /*
+ * Fire all the loads we need. The MAF only has eight entries
+ * so we can have at most eight outstanding loads, so we
+ * unroll by that amount.
+ */
+#pragma unroll 8
+ for (; p >= base; p -= step_size)
+ force_load(p);
+
+ /*
+ * Repeat, but with finv's instead of loads, to get rid of the
+ * data we just loaded into our own cache and the old home L3.
+ * No need to unroll since finv's don't target a register.
+ * The finv's are guaranteed not to actually flush the data in
+ * the buffer back to their home, since we just read it, so the
+ * lines are clean in cache; we will only invalidate those lines.
+ */
+ p = (char *)buffer + size - 1;
+ __insn_finv(p);
+ p -= step_size;
+ p = (char *)((unsigned long)p | (step_size - 1));
+ for (; p >= base; p -= step_size)
+ __insn_finv(p);
+
+ /* Wait for these finv's (and thus the first finvs) to be done. */
+ __insn_mf();
+
+#ifdef __tilegx__
+ /* Reenable the prefetcher. */
+ __insn_mtspr(SPR_DSTREAM_PF, old_dstream_pf);
+#endif
+}
+EXPORT_SYMBOL_GPL(finv_buffer_remote);
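A hypothetical caller (the names and context below are illustrative, not part of this commit) would use the new export when handing a remotely-homed buffer to a device that reads memory directly:

/* Hypothetical usage of finv_buffer_remote() (from <asm/cacheflush.h>):
 * push a buffer's cached data out to memory before a device reads it. */
void hand_buffer_to_device(void *my_buf, size_t my_len)
{
	/* 0 => buffer is homed on a single remote core, not hash-for-home */
	finv_buffer_remote(my_buf, my_len, 0);
	/* ...now tell the device to start reading the buffer... */
}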
diff --git a/arch/tile/lib/checksum.c b/arch/tile/lib/checksum.c
index e4bab5bd3f3..c3ca3e64d9d 100644
--- a/arch/tile/lib/checksum.c
+++ b/arch/tile/lib/checksum.c
@@ -16,19 +16,6 @@
#include <net/checksum.h>
#include <linux/module.h>
-static inline unsigned int longto16(unsigned long x)
-{
- unsigned long ret;
-#ifdef __tilegx__
- ret = __insn_v2sadu(x, 0);
- ret = __insn_v2sadu(ret, 0);
-#else
- ret = __insn_sadh_u(x, 0);
- ret = __insn_sadh_u(ret, 0);
-#endif
- return ret;
-}
-
__wsum do_csum(const unsigned char *buff, int len)
{
int odd, count;
@@ -94,7 +81,7 @@ __wsum do_csum(const unsigned char *buff, int len)
}
if (len & 1)
result += *buff;
- result = longto16(result);
+ result = csum_long(result);
if (odd)
result = swab16(result);
out:
diff --git a/arch/tile/lib/cpumask.c b/arch/tile/lib/cpumask.c
index fdc403614d1..75947edccb2 100644
--- a/arch/tile/lib/cpumask.c
+++ b/arch/tile/lib/cpumask.c
@@ -16,6 +16,7 @@
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/smp.h>
+#include <linux/export.h>
/*
* Allow cropping out bits beyond the end of the array.
@@ -50,3 +51,4 @@ int bitmap_parselist_crop(const char *bp, unsigned long *maskp, int nmaskbits)
} while (*bp != '\0' && *bp != '\n');
return 0;
}
+EXPORT_SYMBOL(bitmap_parselist_crop);
diff --git a/arch/tile/lib/delay.c b/arch/tile/lib/delay.c
index 5801b03c13e..cdacdd11d36 100644
--- a/arch/tile/lib/delay.c
+++ b/arch/tile/lib/delay.c
@@ -15,20 +15,31 @@
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/thread_info.h>
-#include <asm/fixmap.h>
-#include <hv/hypervisor.h>
+#include <asm/timex.h>
void __udelay(unsigned long usecs)
{
- hv_nanosleep(usecs * 1000);
+ if (usecs > ULONG_MAX / 1000) {
+ WARN_ON_ONCE(usecs > ULONG_MAX / 1000);
+ usecs = ULONG_MAX / 1000;
+ }
+ __ndelay(usecs * 1000);
}
EXPORT_SYMBOL(__udelay);
void __ndelay(unsigned long nsecs)
{
- hv_nanosleep(nsecs);
+ cycles_t target = get_cycles();
+ target += ns2cycles(nsecs);
+ while (get_cycles() < target)
+ cpu_relax();
}
EXPORT_SYMBOL(__ndelay);
-/* FIXME: should be declared in a header somewhere. */
+void __delay(unsigned long cycles)
+{
+ cycles_t target = get_cycles() + cycles;
+ while (get_cycles() < target)
+ cpu_relax();
+}
EXPORT_SYMBOL(__delay);
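The new __ndelay() spins on the cycle counter until ns2cycles(nsecs) cycles have elapsed. As a rough sketch of that conversion (illustrative only; the kernel's ns2cycles() lives elsewhere and must also guard against 64-bit overflow): at a 1 GHz clock one nanosecond is one cycle, and in general:

/* Sketch of the nanosecond-to-cycle conversion __ndelay() depends on. */
static unsigned long long ns_to_cycles(unsigned long long nsecs,
				       unsigned long long cycles_per_sec)
{
	/* cycles = ns * (cycles/second) / (nanoseconds/second) */
	return nsecs * cycles_per_sec / 1000000000ULL;
}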
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index 1509c559765..82733c87d67 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -18,17 +18,11 @@
/* arch/tile/lib/usercopy.S */
#include <linux/uaccess.h>
-EXPORT_SYMBOL(__get_user_1);
-EXPORT_SYMBOL(__get_user_2);
-EXPORT_SYMBOL(__get_user_4);
-EXPORT_SYMBOL(__get_user_8);
-EXPORT_SYMBOL(__put_user_1);
-EXPORT_SYMBOL(__put_user_2);
-EXPORT_SYMBOL(__put_user_4);
-EXPORT_SYMBOL(__put_user_8);
EXPORT_SYMBOL(strnlen_user_asm);
EXPORT_SYMBOL(strncpy_from_user_asm);
EXPORT_SYMBOL(clear_user_asm);
+EXPORT_SYMBOL(flush_user_asm);
+EXPORT_SYMBOL(finv_user_asm);
/* arch/tile/kernel/entry.S */
#include <linux/kernel.h>
@@ -36,6 +30,15 @@ EXPORT_SYMBOL(clear_user_asm);
EXPORT_SYMBOL(current_text_addr);
EXPORT_SYMBOL(dump_stack);
+/* arch/tile/kernel/head.S */
+EXPORT_SYMBOL(empty_zero_page);
+
+#ifdef CONFIG_FUNCTION_TRACER
+/* arch/tile/kernel/mcount_64.S */
+#include <asm/ftrace.h>
+EXPORT_SYMBOL(__mcount);
+#endif /* CONFIG_FUNCTION_TRACER */
+
/* arch/tile/lib/, various memcpy files */
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__copy_to_user_inatomic);
@@ -45,9 +48,6 @@ EXPORT_SYMBOL(__copy_from_user_zeroing);
EXPORT_SYMBOL(__copy_in_user_inatomic);
#endif
-/* arch/tile/lib/mb_incoherent.S */
-EXPORT_SYMBOL(__mb_incoherent);
-
/* hypervisor glue */
#include <hv/hypervisor.h>
EXPORT_SYMBOL(hv_dev_open);
@@ -60,6 +60,8 @@ EXPORT_SYMBOL(hv_dev_poll_cancel);
EXPORT_SYMBOL(hv_dev_close);
EXPORT_SYMBOL(hv_sysconf);
EXPORT_SYMBOL(hv_confstr);
+EXPORT_SYMBOL(hv_get_rtc);
+EXPORT_SYMBOL(hv_set_rtc);
/* libgcc.a */
uint32_t __udivsi3(uint32_t dividend, uint32_t divisor);
@@ -79,10 +81,14 @@ EXPORT_SYMBOL(__umoddi3);
int64_t __moddi3(int64_t dividend, int64_t divisor);
EXPORT_SYMBOL(__moddi3);
#ifndef __tilegx__
-uint64_t __ll_mul(uint64_t n0, uint64_t n1);
-EXPORT_SYMBOL(__ll_mul);
int64_t __muldi3(int64_t, int64_t);
EXPORT_SYMBOL(__muldi3);
uint64_t __lshrdi3(uint64_t, unsigned int);
EXPORT_SYMBOL(__lshrdi3);
+uint64_t __ashrdi3(uint64_t, unsigned int);
+EXPORT_SYMBOL(__ashrdi3);
+uint64_t __ashldi3(uint64_t, unsigned int);
+EXPORT_SYMBOL(__ashldi3);
+int __ffsdi2(uint64_t);
+EXPORT_SYMBOL(__ffsdi2);
#endif
diff --git a/arch/tile/lib/mb_incoherent.S b/arch/tile/lib/mb_incoherent.S
deleted file mode 100644
index 989ad7b68d5..00000000000
--- a/arch/tile/lib/mb_incoherent.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for
- * more details.
- *
- * Assembly code for invoking the HV's fence_incoherent syscall.
- */
-
-#include <linux/linkage.h>
-#include <hv/syscall_public.h>
-#include <arch/abi.h>
-#include <arch/chip.h>
-
-#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
-
-/*
- * Invoke the hypervisor's fence_incoherent syscall, which guarantees
- * that all victims for cachelines homed on this tile have reached memory.
- */
-STD_ENTRY(__mb_incoherent)
- moveli TREG_SYSCALL_NR_NAME, HV_SYS_fence_incoherent
- swint2
- jrp lr
- STD_ENDPROC(__mb_incoherent)
-
-#endif
diff --git a/arch/tile/lib/memchr_32.c b/arch/tile/lib/memchr_32.c
index 6235283b485..cc3d9badf03 100644
--- a/arch/tile/lib/memchr_32.c
+++ b/arch/tile/lib/memchr_32.c
@@ -18,12 +18,24 @@
void *memchr(const void *s, int c, size_t n)
{
+ const uint32_t *last_word_ptr;
+ const uint32_t *p;
+ const char *last_byte_ptr;
+ uintptr_t s_int;
+ uint32_t goal, before_mask, v, bits;
+ char *ret;
+
+ if (__builtin_expect(n == 0, 0)) {
+ /* Don't dereference any memory if the array is empty. */
+ return NULL;
+ }
+
/* Get an aligned pointer. */
- const uintptr_t s_int = (uintptr_t) s;
- const uint32_t *p = (const uint32_t *)(s_int & -4);
+ s_int = (uintptr_t) s;
+ p = (const uint32_t *)(s_int & -4);
/* Create four copies of the byte for which we are looking. */
- const uint32_t goal = 0x01010101 * (uint8_t) c;
+ goal = 0x01010101 * (uint8_t) c;
/* Read the first word, but munge it so that bytes before the array
* will not match goal.
@@ -31,23 +43,14 @@ void *memchr(const void *s, int c, size_t n)
* Note that this shift count expression works because we know
* shift counts are taken mod 32.
*/
- const uint32_t before_mask = (1 << (s_int << 3)) - 1;
- uint32_t v = (*p | before_mask) ^ (goal & before_mask);
+ before_mask = (1 << (s_int << 3)) - 1;
+ v = (*p | before_mask) ^ (goal & before_mask);
/* Compute the address of the last byte. */
- const char *const last_byte_ptr = (const char *)s + n - 1;
+ last_byte_ptr = (const char *)s + n - 1;
/* Compute the address of the word containing the last byte. */
- const uint32_t *const last_word_ptr =
- (const uint32_t *)((uintptr_t) last_byte_ptr & -4);
-
- uint32_t bits;
- char *ret;
-
- if (__builtin_expect(n == 0, 0)) {
- /* Don't dereference any memory if the array is empty. */
- return NULL;
- }
+ last_word_ptr = (const uint32_t *)((uintptr_t) last_byte_ptr & -4);
while ((bits = __insn_seqb(v, goal)) == 0) {
if (__builtin_expect(p == last_word_ptr, 0)) {
diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c
new file mode 100644
index 00000000000..f8196b3a950
--- /dev/null
+++ b/arch/tile/lib/memchr_64.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include "string-endian.h"
+
+void *memchr(const void *s, int c, size_t n)
+{
+ const uint64_t *last_word_ptr;
+ const uint64_t *p;
+ const char *last_byte_ptr;
+ uintptr_t s_int;
+ uint64_t goal, before_mask, v, bits;
+ char *ret;
+
+ if (__builtin_expect(n == 0, 0)) {
+ /* Don't dereference any memory if the array is empty. */
+ return NULL;
+ }
+
+ /* Get an aligned pointer. */
+ s_int = (uintptr_t) s;
+ p = (const uint64_t *)(s_int & -8);
+
+ /* Create eight copies of the byte for which we are looking. */
+ goal = copy_byte(c);
+
+ /* Read the first word, but munge it so that bytes before the array
+ * will not match goal.
+ */
+ before_mask = MASK(s_int);
+ v = (*p | before_mask) ^ (goal & before_mask);
+
+ /* Compute the address of the last byte. */
+ last_byte_ptr = (const char *)s + n - 1;
+
+ /* Compute the address of the word containing the last byte. */
+ last_word_ptr = (const uint64_t *)((uintptr_t) last_byte_ptr & -8);
+
+ while ((bits = __insn_v1cmpeq(v, goal)) == 0) {
+ if (__builtin_expect(p == last_word_ptr, 0)) {
+ /* We already read the last word in the array,
+ * so give up.
+ */
+ return NULL;
+ }
+ v = *++p;
+ }
+
+ /* We found a match, but it might be in a byte past the end
+ * of the array.
+ */
+ ret = ((char *)p) + (CFZ(bits) >> 3);
+ return (ret <= last_byte_ptr) ? ret : NULL;
+}
+EXPORT_SYMBOL(memchr);
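A worked example of the masking step above may help. Suppose we search for c = 0x41 ('A') and the buffer starts three bytes into an aligned word (little-endian): MASK(s_int) covers those three leading bytes, and v = (*p | mask) ^ (goal & mask) rewrites each of them to 0xff ^ 0x41 = 0xbe, which can never compare equal to 0x41, so bytes before the array cannot produce a false hit. A small stand-alone check of that arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t goal = 0x0101010101010101ULL * 0x41;	/* eight copies of 'A' */
	uint64_t mask = (1ULL << (3 * 8)) - 1;		/* 3 pre-buffer bytes */
	uint64_t word = 0x4141414141414141ULL;		/* worst case: all 'A' */
	uint64_t v = (word | mask) ^ (goal & mask);

	/* Low three bytes are now 0xbe, so a byte-wise compare against
	 * goal cannot match before the start of the buffer. */
	printf("%016llx\n", (unsigned long long)v);	/* 4141414141bebebe */
	return 0;
}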
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
index 2a419a6122d..a2771ae5da5 100644
--- a/arch/tile/lib/memcpy_32.S
+++ b/arch/tile/lib/memcpy_32.S
@@ -22,14 +22,6 @@
#include <linux/linkage.h>
-/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-#define memcpy __memcpy_asm
-#define __copy_to_user_inatomic __copy_to_user_inatomic_asm
-#define __copy_from_user_inatomic __copy_from_user_inatomic_asm
-#define __copy_from_user_zeroing __copy_from_user_zeroing_asm
-#endif
-
#define IS_MEMCPY 0
#define IS_COPY_FROM_USER 1
#define IS_COPY_FROM_USER_ZEROING 2
@@ -44,6 +36,7 @@
*/
#define EX \
.pushsection __ex_table, "a"; \
+ .align 4; \
.word 9f, memcpy_common_fixup; \
.popsection; \
9
@@ -158,12 +151,9 @@ EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
{ addi r3, r1, 60; andi r9, r9, -64 }
-#if CHIP_HAS_WH64()
/* No need to prefetch dst, we'll just do the wh64
* right before we copy a line.
*/
-#endif
-
EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 }
/* Intentionally stall for a few cycles to leave L2 cache alone. */
{ bnzt zero, .; move r27, lr }
@@ -171,21 +161,6 @@ EX: { lw r6, r3; addi r3, r3, 64 }
/* Intentionally stall for a few cycles to leave L2 cache alone. */
{ bnzt zero, . }
EX: { lw r7, r3; addi r3, r3, 64 }
-#if !CHIP_HAS_WH64()
- /* Prefetch the dest */
- /* Intentionally stall for a few cycles to leave L2 cache alone. */
- { bnzt zero, . }
- /* Use a real load to cause a TLB miss if necessary. We aren't using
- * r28, so this should be fine.
- */
-EX: { lw r28, r9; addi r9, r9, 64 }
- /* Intentionally stall for a few cycles to leave L2 cache alone. */
- { bnzt zero, . }
- { prefetch r9; addi r9, r9, 64 }
- /* Intentionally stall for a few cycles to leave L2 cache alone. */
- { bnzt zero, . }
- { prefetch r9; addi r9, r9, 64 }
-#endif
/* Intentionally stall for a few cycles to leave L2 cache alone. */
{ bz zero, .Lbig_loop2 }
@@ -286,13 +261,8 @@ EX: { lw r7, r3; addi r3, r3, 64 }
/* Fill second L1D line. */
EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
-#if CHIP_HAS_WH64()
/* Prepare destination line for writing. */
EX: { wh64 r9; addi r9, r9, 64 }
-#else
- /* Prefetch dest line */
- { prefetch r9; addi r9, r9, 64 }
-#endif
/* Load seven words that are L1D hits to cover wh64 L2 usage. */
/* Load the three remaining words from the last L1D line, which
@@ -330,16 +300,7 @@ EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */
EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
-#if CHIP_HAS_WH64()
EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
-#else
- /* Back up the r9 to a cache line we are already storing to
- * if it gets past the end of the dest vector. Strictly speaking,
- * we don't need to back up to the start of a cache line, but it's free
- * and tidy, so why not?
- */
-EX: { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
-#endif
/* Store second L1D line. */
EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */
@@ -403,7 +364,6 @@ EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
.Ldest_is_word_aligned:
-#if CHIP_HAS_DWORD_ALIGN()
EX: { andi r8, r0, 63; lwadd_na r6, r1, 4}
{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }
@@ -511,26 +471,6 @@ EX: { swadd r0, r13, 4; addi r2, r2, -32 }
/* Move r1 back to the point where it corresponds to r0. */
{ addi r1, r1, -4 }
-#else /* !CHIP_HAS_DWORD_ALIGN() */
-
- /* Compute right/left shift counts and load initial source words. */
- { andi r5, r1, -4; andi r3, r1, 3 }
-EX: { lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 }
-EX: { lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 }
-
- /* Load and store one word at a time, using shifts and ORs
- * to correct for the misaligned src.
- */
-.Lcopy_unaligned_src_loop:
- { shr r6, r6, r3; shl r8, r7, r4 }
-EX: { lw r7, r5; or r8, r8, r6; move r6, r7 }
-EX: { sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 }
- { addi r5, r5, 4; slti_u r8, r2, 8 }
- { bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 }
-
- { bz r2, .Lcopy_unaligned_done }
-#endif /* !CHIP_HAS_DWORD_ALIGN() */
-
/* Fall through */
/*
@@ -614,5 +554,6 @@ memcpy_fixup_loop:
.size memcpy_common_fixup, . - memcpy_common_fixup
.section __ex_table,"a"
+ .align 4
.word .Lcfu, .Lcopy_from_user_fixup_zero_remainder
.word .Lctu, .Lcopy_to_user_fixup_done
diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c
new file mode 100644
index 00000000000..4815354b8cd
--- /dev/null
+++ b/arch/tile/lib/memcpy_64.c
@@ -0,0 +1,367 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+/* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */
+
+/* Must be 8 bytes in size. */
+#define op_t uint64_t
+
+/* Threshold value for when to enter the unrolled loops. */
+#define OP_T_THRES 16
+
+#if CHIP_L2_LINE_SIZE() != 64
+#error "Assumes 64 byte line size"
+#endif
+
+/* How many cache lines ahead should we prefetch? */
+#define PREFETCH_LINES_AHEAD 4
+
+/*
+ * Provide "base versions" of load and store for the normal code path.
+ * The kernel provides other versions for userspace copies.
+ */
+#define ST(p, v) (*(p) = (v))
+#define LD(p) (*(p))
+
+#ifndef USERCOPY_FUNC
+#define ST1 ST
+#define ST2 ST
+#define ST4 ST
+#define ST8 ST
+#define LD1 LD
+#define LD2 LD
+#define LD4 LD
+#define LD8 LD
+#define RETVAL dstv
+void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n)
+#else
+/*
+ * Special kernel version will provide implementation of the LDn/STn
+ * macros to return a count of uncopied bytes due to mm fault.
+ */
+#define RETVAL 0
+int __attribute__((optimize("omit-frame-pointer")))
+USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
+#endif
+{
+ char *__restrict dst1 = (char *)dstv;
+ const char *__restrict src1 = (const char *)srcv;
+ const char *__restrict src1_end;
+ const char *__restrict prefetch;
+ op_t *__restrict dst8; /* 8-byte pointer to destination memory. */
+ op_t final; /* Final bytes to write to trailing word, if any */
+ long i;
+
+ if (n < 16) {
+ for (; n; n--)
+ ST1(dst1++, LD1(src1++));
+ return RETVAL;
+ }
+
+ /*
+ * Locate the end of source memory we will copy. Don't
+ * prefetch past this.
+ */
+ src1_end = src1 + n - 1;
+
+ /* Prefetch ahead a few cache lines, but not past the end. */
+ prefetch = src1;
+ for (i = 0; i < PREFETCH_LINES_AHEAD; i++) {
+ __insn_prefetch(prefetch);
+ prefetch += CHIP_L2_LINE_SIZE();
+ prefetch = (prefetch < src1_end) ? prefetch : src1;
+ }
+
+ /* Copy bytes until dst is word-aligned. */
+ for (; (uintptr_t)dst1 & (sizeof(op_t) - 1); n--)
+ ST1(dst1++, LD1(src1++));
+
+ /* 8-byte pointer to destination memory. */
+ dst8 = (op_t *)dst1;
+
+ if (__builtin_expect((uintptr_t)src1 & (sizeof(op_t) - 1), 0)) {
+ /* Unaligned copy. */
+
+ op_t tmp0 = 0, tmp1 = 0, tmp2, tmp3;
+ const op_t *src8 = (const op_t *) ((uintptr_t)src1 &
+ -sizeof(op_t));
+ const void *srci = (void *)src1;
+ int m;
+
+ m = (CHIP_L2_LINE_SIZE() << 2) -
+ (((uintptr_t)dst8) & ((CHIP_L2_LINE_SIZE() << 2) - 1));
+ m = (n < m) ? n : m;
+ m /= sizeof(op_t);
+
+ /* Copy until 'dst' is cache-line-aligned. */
+ n -= (sizeof(op_t) * m);
+
+ switch (m % 4) {
+ case 0:
+ if (__builtin_expect(!m, 0))
+ goto _M0;
+ tmp1 = LD8(src8++);
+ tmp2 = LD8(src8++);
+ goto _8B3;
+ case 2:
+ m += 2;
+ tmp3 = LD8(src8++);
+ tmp0 = LD8(src8++);
+ goto _8B1;
+ case 3:
+ m += 1;
+ tmp2 = LD8(src8++);
+ tmp3 = LD8(src8++);
+ goto _8B2;
+ case 1:
+ m--;
+ tmp0 = LD8(src8++);
+ tmp1 = LD8(src8++);
+ if (__builtin_expect(!m, 0))
+ goto _8B0;
+ }
+
+ do {
+ tmp2 = LD8(src8++);
+ tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+ ST8(dst8++, tmp0);
+_8B3:
+ tmp3 = LD8(src8++);
+ tmp1 = __insn_dblalign(tmp1, tmp2, srci);
+ ST8(dst8++, tmp1);
+_8B2:
+ tmp0 = LD8(src8++);
+ tmp2 = __insn_dblalign(tmp2, tmp3, srci);
+ ST8(dst8++, tmp2);
+_8B1:
+ tmp1 = LD8(src8++);
+ tmp3 = __insn_dblalign(tmp3, tmp0, srci);
+ ST8(dst8++, tmp3);
+ m -= 4;
+ } while (m);
+
+_8B0:
+ tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+ ST8(dst8++, tmp0);
+ src8--;
+
+_M0:
+ if (__builtin_expect(n >= CHIP_L2_LINE_SIZE(), 0)) {
+ op_t tmp4, tmp5, tmp6, tmp7, tmp8;
+
+ prefetch = ((const char *)src8) +
+ CHIP_L2_LINE_SIZE() * PREFETCH_LINES_AHEAD;
+
+ for (tmp0 = LD8(src8++); n >= CHIP_L2_LINE_SIZE();
+ n -= CHIP_L2_LINE_SIZE()) {
+ /* Prefetch and advance to next line to
+ prefetch, but don't go past the end. */
+ __insn_prefetch(prefetch);
+
+ /* Make sure prefetch got scheduled
+ earlier. */
+ __asm__ ("" : : : "memory");
+
+ prefetch += CHIP_L2_LINE_SIZE();
+ prefetch = (prefetch < src1_end) ? prefetch :
+ (const char *) src8;
+
+ tmp1 = LD8(src8++);
+ tmp2 = LD8(src8++);
+ tmp3 = LD8(src8++);
+ tmp4 = LD8(src8++);
+ tmp5 = LD8(src8++);
+ tmp6 = LD8(src8++);
+ tmp7 = LD8(src8++);
+ tmp8 = LD8(src8++);
+
+ tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+ tmp1 = __insn_dblalign(tmp1, tmp2, srci);
+ tmp2 = __insn_dblalign(tmp2, tmp3, srci);
+ tmp3 = __insn_dblalign(tmp3, tmp4, srci);
+ tmp4 = __insn_dblalign(tmp4, tmp5, srci);
+ tmp5 = __insn_dblalign(tmp5, tmp6, srci);
+ tmp6 = __insn_dblalign(tmp6, tmp7, srci);
+ tmp7 = __insn_dblalign(tmp7, tmp8, srci);
+
+ __insn_wh64(dst8);
+
+ ST8(dst8++, tmp0);
+ ST8(dst8++, tmp1);
+ ST8(dst8++, tmp2);
+ ST8(dst8++, tmp3);
+ ST8(dst8++, tmp4);
+ ST8(dst8++, tmp5);
+ ST8(dst8++, tmp6);
+ ST8(dst8++, tmp7);
+
+ tmp0 = tmp8;
+ }
+ src8--;
+ }
+
+ /* Copy the rest 8-byte chunks. */
+ if (n >= sizeof(op_t)) {
+ tmp0 = LD8(src8++);
+ for (; n >= sizeof(op_t); n -= sizeof(op_t)) {
+ tmp1 = LD8(src8++);
+ tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+ ST8(dst8++, tmp0);
+ tmp0 = tmp1;
+ }
+ src8--;
+ }
+
+ if (n == 0)
+ return RETVAL;
+
+ tmp0 = LD8(src8++);
+ tmp1 = ((const char *)src8 <= src1_end)
+ ? LD8((op_t *)src8) : 0;
+ final = __insn_dblalign(tmp0, tmp1, srci);
+
+ } else {
+ /* Aligned copy. */
+
+ const op_t *__restrict src8 = (const op_t *)src1;
+
+ /* src8 and dst8 are both word-aligned. */
+ if (n >= CHIP_L2_LINE_SIZE()) {
+ /* Copy until 'dst' is cache-line-aligned. */
+ for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1);
+ n -= sizeof(op_t))
+ ST8(dst8++, LD8(src8++));
+
+ for (; n >= CHIP_L2_LINE_SIZE(); ) {
+ op_t tmp0, tmp1, tmp2, tmp3;
+ op_t tmp4, tmp5, tmp6, tmp7;
+
+ /*
+ * Prefetch and advance to next line
+ * to prefetch, but don't go past the
+ * end.
+ */
+ __insn_prefetch(prefetch);
+
+ /* Make sure prefetch got scheduled
+ earlier. */
+ __asm__ ("" : : : "memory");
+
+ prefetch += CHIP_L2_LINE_SIZE();
+ prefetch = (prefetch < src1_end) ? prefetch :
+ (const char *)src8;
+
+ /*
+ * Do all the loads before wh64. This
+ * is necessary if [src8, src8+7] and
+ * [dst8, dst8+7] share the same cache
+ * line and dst8 <= src8, as can be
+ * the case when called from memmove,
+ * or with code tested on x86 whose
+ * memcpy always works with forward
+ * copies.
+ */
+ tmp0 = LD8(src8++);
+ tmp1 = LD8(src8++);
+ tmp2 = LD8(src8++);
+ tmp3 = LD8(src8++);
+ tmp4 = LD8(src8++);
+ tmp5 = LD8(src8++);
+ tmp6 = LD8(src8++);
+ tmp7 = LD8(src8++);
+
+ /* wh64 and wait for tmp7 load completion. */
+ __asm__ ("move %0, %0; wh64 %1\n"
+ : : "r"(tmp7), "r"(dst8));
+
+ ST8(dst8++, tmp0);
+ ST8(dst8++, tmp1);
+ ST8(dst8++, tmp2);
+ ST8(dst8++, tmp3);
+ ST8(dst8++, tmp4);
+ ST8(dst8++, tmp5);
+ ST8(dst8++, tmp6);
+ ST8(dst8++, tmp7);
+
+ n -= CHIP_L2_LINE_SIZE();
+ }
+#if CHIP_L2_LINE_SIZE() != 64
+# error "Fix code that assumes particular L2 cache line size."
+#endif
+ }
+
+ for (; n >= sizeof(op_t); n -= sizeof(op_t))
+ ST8(dst8++, LD8(src8++));
+
+ if (__builtin_expect(n == 0, 1))
+ return RETVAL;
+
+ final = LD8(src8);
+ }
+
+ /* n != 0 if we get here. Write out any trailing bytes. */
+ dst1 = (char *)dst8;
+#ifndef __BIG_ENDIAN__
+ if (n & 4) {
+ ST4((uint32_t *)dst1, final);
+ dst1 += 4;
+ final >>= 32;
+ n &= 3;
+ }
+ if (n & 2) {
+ ST2((uint16_t *)dst1, final);
+ dst1 += 2;
+ final >>= 16;
+ n &= 1;
+ }
+ if (n)
+ ST1((uint8_t *)dst1, final);
+#else
+ if (n & 4) {
+ ST4((uint32_t *)dst1, final >> 32);
+ dst1 += 4;
+ }
+ else
+ {
+ final >>= 32;
+ }
+ if (n & 2) {
+ ST2((uint16_t *)dst1, final >> 16);
+ dst1 += 2;
+ }
+ else
+ {
+ final >>= 16;
+ }
+ if (n & 1)
+ ST1((uint8_t *)dst1, final >> 8);
+#endif
+
+ return RETVAL;
+}
+
+#ifdef USERCOPY_FUNC
+#undef ST1
+#undef ST2
+#undef ST4
+#undef ST8
+#undef LD1
+#undef LD2
+#undef LD4
+#undef LD8
+#undef USERCOPY_FUNC
+#endif
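The unaligned branch above keeps two adjacent aligned source words live and uses __insn_dblalign() to extract the destination word that straddles them. A portable sketch of the same funnel-shift idea (little-endian, illustration only; it assumes the source really is misaligned, and, like the code above, it reads the aligned word containing the last source byte):

#include <stdint.h>
#include <stddef.h>

/* Generic equivalent of the dblalign-based inner loop: shift/OR two
 * neighbouring aligned words to produce each unaligned destination word.
 * Sketch only -- assumes shift != 0 (src misaligned), as in that branch. */
static void copy_unaligned_words(uint64_t *dst, const uint8_t *src,
				 size_t nwords)
{
	const uint64_t *s = (const uint64_t *)((uintptr_t)src & ~(uintptr_t)7);
	unsigned int shift = ((uintptr_t)src & 7) * 8;
	uint64_t lo = *s++;

	while (nwords--) {
		uint64_t hi = *s++;
		*dst++ = (lo >> shift) | (hi << (64 - shift));
		lo = hi;
	}
}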
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
deleted file mode 100644
index f7d4a6ad61e..00000000000
--- a/arch/tile/lib/memcpy_tile64.c
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for
- * more details.
- */
-
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <asm/fixmap.h>
-#include <asm/kmap_types.h>
-#include <asm/tlbflush.h>
-#include <hv/hypervisor.h>
-#include <arch/chip.h>
-
-
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-
-/* Defined in memcpy.S */
-extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n);
-extern unsigned long __copy_to_user_inatomic_asm(
- void __user *to, const void *from, unsigned long n);
-extern unsigned long __copy_from_user_inatomic_asm(
- void *to, const void __user *from, unsigned long n);
-extern unsigned long __copy_from_user_zeroing_asm(
- void *to, const void __user *from, unsigned long n);
-
-typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);
-
-/* Size above which to consider TLB games for performance */
-#define LARGE_COPY_CUTOFF 2048
-
-/* Communicate to the simulator what we are trying to do. */
-#define sim_allow_multiple_caching(b) \
- __insn_mtspr(SPR_SIM_CONTROL, \
- SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS))
-
-/*
- * Copy memory by briefly enabling incoherent cacheline-at-a-time mode.
- *
- * We set up our own source and destination PTEs that we fully control.
- * This is the only way to guarantee that we don't race with another
- * thread that is modifying the PTE; we can't afford to try the
- * copy_{to,from}_user() technique of catching the interrupt, since
- * we must run with interrupts disabled to avoid the risk of some
- * other code seeing the incoherent data in our cache. (Recall that
- * our cache is indexed by PA, so even if the other code doesn't use
- * our kmap_atomic virtual addresses, they'll still hit in cache using
- * the normal VAs that aren't supposed to hit in cache.)
- */
-static void memcpy_multicache(void *dest, const void *source,
- pte_t dst_pte, pte_t src_pte, int len)
-{
- int idx;
- unsigned long flags, newsrc, newdst;
- pmd_t *pmdp;
- pte_t *ptep;
- int type0, type1;
- int cpu = get_cpu();
-
- /*
- * Disable interrupts so that we don't recurse into memcpy()
- * in an interrupt handler, nor accidentally reference
- * the PA of the source from an interrupt routine. Also
- * notify the simulator that we're playing games so we don't
- * generate spurious coherency warnings.
- */
- local_irq_save(flags);
- sim_allow_multiple_caching(1);
-
- /* Set up the new dest mapping */
- type0 = kmap_atomic_idx_push();
- idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0;
- newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
- pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
- ptep = pte_offset_kernel(pmdp, newdst);
- if (pte_val(*ptep) != pte_val(dst_pte)) {
- set_pte(ptep, dst_pte);
- local_flush_tlb_page(NULL, newdst, PAGE_SIZE);
- }
-
- /* Set up the new source mapping */
- type1 = kmap_atomic_idx_push();
- idx += (type0 - type1);
- src_pte = hv_pte_set_nc(src_pte);
- src_pte = hv_pte_clear_writable(src_pte); /* be paranoid */
- newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
- pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
- ptep = pte_offset_kernel(pmdp, newsrc);
- *ptep = src_pte; /* set_pte() would be confused by this */
- local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
- /* Actually move the data. */
- __memcpy_asm((void *)newdst, (const void *)newsrc, len);
-
- /*
- * Remap the source as locally-cached and not OLOC'ed so that
- * we can inval without also invaling the remote cpu's cache.
- * This also avoids known errata with inv'ing cacheable oloc data.
- */
- src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
- src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
- *ptep = src_pte; /* set_pte() would be confused by this */
- local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
- /*
- * Do the actual invalidation, covering the full L2 cache line
- * at the end since __memcpy_asm() is somewhat aggressive.
- */
- __inv_buffer((void *)newsrc, len);
-
- /*
- * We're done: notify the simulator that all is back to normal,
- * and re-enable interrupts and pre-emption.
- */
- kmap_atomic_idx_pop();
- kmap_atomic_idx_pop();
- sim_allow_multiple_caching(0);
- local_irq_restore(flags);
- put_cpu();
-}
-
-/*
- * Identify large copies from remotely-cached memory, and copy them
- * via memcpy_multicache() if they look good, otherwise fall back
- * to the particular kind of copying passed as the memcpy_t function.
- */
-static unsigned long fast_copy(void *dest, const void *source, int len,
- memcpy_t func)
-{
- /*
- * Check if it's big enough to bother with. We may end up doing a
- * small copy via TLB manipulation if we're near a page boundary,
- * but presumably we'll make it up when we hit the second page.
- */
- while (len >= LARGE_COPY_CUTOFF) {
- int copy_size, bytes_left_on_page;
- pte_t *src_ptep, *dst_ptep;
- pte_t src_pte, dst_pte;
- struct page *src_page, *dst_page;
-
- /* Is the source page oloc'ed to a remote cpu? */
-retry_source:
- src_ptep = virt_to_pte(current->mm, (unsigned long)source);
- if (src_ptep == NULL)
- break;
- src_pte = *src_ptep;
- if (!hv_pte_get_present(src_pte) ||
- !hv_pte_get_readable(src_pte) ||
- hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3)
- break;
- if (get_remote_cache_cpu(src_pte) == smp_processor_id())
- break;
- src_page = pfn_to_page(hv_pte_get_pfn(src_pte));
- get_page(src_page);
- if (pte_val(src_pte) != pte_val(*src_ptep)) {
- put_page(src_page);
- goto retry_source;
- }
- if (pte_huge(src_pte)) {
- /* Adjust the PTE to correspond to a small page */
- int pfn = hv_pte_get_pfn(src_pte);
- pfn += (((unsigned long)source & (HPAGE_SIZE-1))
- >> PAGE_SHIFT);
- src_pte = pfn_pte(pfn, src_pte);
- src_pte = pte_mksmall(src_pte);
- }
-
- /* Is the destination page writable? */
-retry_dest:
- dst_ptep = virt_to_pte(current->mm, (unsigned long)dest);
- if (dst_ptep == NULL) {
- put_page(src_page);
- break;
- }
- dst_pte = *dst_ptep;
- if (!hv_pte_get_present(dst_pte) ||
- !hv_pte_get_writable(dst_pte)) {
- put_page(src_page);
- break;
- }
- dst_page = pfn_to_page(hv_pte_get_pfn(dst_pte));
- if (dst_page == src_page) {
- /*
- * Source and dest are on the same page; this
- * potentially exposes us to incoherence if any
- * part of src and dest overlap on a cache line.
- * Just give up rather than trying to be precise.
- */
- put_page(src_page);
- break;
- }
- get_page(dst_page);
- if (pte_val(dst_pte) != pte_val(*dst_ptep)) {
- put_page(dst_page);
- goto retry_dest;
- }
- if (pte_huge(dst_pte)) {
- /* Adjust the PTE to correspond to a small page */
- int pfn = hv_pte_get_pfn(dst_pte);
- pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
- >> PAGE_SHIFT);
- dst_pte = pfn_pte(pfn, dst_pte);
- dst_pte = pte_mksmall(dst_pte);
- }
-
- /* All looks good: create a cachable PTE and copy from it */
- copy_size = len;
- bytes_left_on_page =
- PAGE_SIZE - (((int)source) & (PAGE_SIZE-1));
- if (copy_size > bytes_left_on_page)
- copy_size = bytes_left_on_page;
- bytes_left_on_page =
- PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1));
- if (copy_size > bytes_left_on_page)
- copy_size = bytes_left_on_page;
- memcpy_multicache(dest, source, dst_pte, src_pte, copy_size);
-
- /* Release the pages */
- put_page(dst_page);
- put_page(src_page);
-
- /* Continue on the next page */
- dest += copy_size;
- source += copy_size;
- len -= copy_size;
- }
-
- return func(dest, source, len);
-}
-
-void *memcpy(void *to, const void *from, __kernel_size_t n)
-{
- if (n < LARGE_COPY_CUTOFF)
- return (void *)__memcpy_asm(to, from, n);
- else
- return (void *)fast_copy(to, from, n, __memcpy_asm);
-}
-
-unsigned long __copy_to_user_inatomic(void __user *to, const void *from,
- unsigned long n)
-{
- if (n < LARGE_COPY_CUTOFF)
- return __copy_to_user_inatomic_asm(to, from, n);
- else
- return fast_copy(to, from, n, __copy_to_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_inatomic(void *to, const void __user *from,
- unsigned long n)
-{
- if (n < LARGE_COPY_CUTOFF)
- return __copy_from_user_inatomic_asm(to, from, n);
- else
- return fast_copy(to, from, n, __copy_from_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
- unsigned long n)
-{
- if (n < LARGE_COPY_CUTOFF)
- return __copy_from_user_zeroing_asm(to, from, n);
- else
- return fast_copy(to, from, n, __copy_from_user_zeroing_asm);
-}
-
-#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */
diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c
new file mode 100644
index 00000000000..88c7016492c
--- /dev/null
+++ b/arch/tile/lib/memcpy_user_64.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ *
+ * Do memcpy(), but trap and return "n" when a load or store faults.
+ *
+ * Note: this idiom only works when memcpy() compiles to a leaf function.
+ * Here "leaf function" not only means that it makes no calls, but also
+ * that it performs no stack operations (sp, stack frame pointer) and
+ * uses no callee-saved registers; otherwise "jrp lr" would be incorrect,
+ * since stack-frame unwinding is bypassed. memcpy() is simple enough
+ * that these conditions are satisfied here, but we need to be careful
+ * when modifying this file. This is not a clean solution, but it is the
+ * best one so far.
+ *
+ * Also note that we are capturing "n" from the containing scope here.
+ */
+
+#define _ST(p, inst, v) \
+ ({ \
+ asm("1: " #inst " %0, %1;" \
+ ".pushsection .coldtext.memcpy,\"ax\";" \
+ "2: { move r0, %2; jrp lr };" \
+ ".section __ex_table,\"a\";" \
+ ".align 8;" \
+ ".quad 1b, 2b;" \
+ ".popsection" \
+ : "=m" (*(p)) : "r" (v), "r" (n)); \
+ })
+
+#define _LD(p, inst) \
+ ({ \
+ unsigned long __v; \
+ asm("1: " #inst " %0, %1;" \
+ ".pushsection .coldtext.memcpy,\"ax\";" \
+ "2: { move r0, %2; jrp lr };" \
+ ".section __ex_table,\"a\";" \
+ ".align 8;" \
+ ".quad 1b, 2b;" \
+ ".popsection" \
+ : "=r" (__v) : "m" (*(p)), "r" (n)); \
+ __v; \
+ })
+
+#define USERCOPY_FUNC __copy_to_user_inatomic
+#define ST1(p, v) _ST((p), st1, (v))
+#define ST2(p, v) _ST((p), st2, (v))
+#define ST4(p, v) _ST((p), st4, (v))
+#define ST8(p, v) _ST((p), st, (v))
+#define LD1 LD
+#define LD2 LD
+#define LD4 LD
+#define LD8 LD
+#include "memcpy_64.c"
+
+#define USERCOPY_FUNC __copy_from_user_inatomic
+#define ST1 ST
+#define ST2 ST
+#define ST4 ST
+#define ST8 ST
+#define LD1(p) _LD((p), ld1u)
+#define LD2(p) _LD((p), ld2u)
+#define LD4(p) _LD((p), ld4u)
+#define LD8(p) _LD((p), ld)
+#include "memcpy_64.c"
+
+#define USERCOPY_FUNC __copy_in_user_inatomic
+#define ST1(p, v) _ST((p), st1, (v))
+#define ST2(p, v) _ST((p), st2, (v))
+#define ST4(p, v) _ST((p), st4, (v))
+#define ST8(p, v) _ST((p), st, (v))
+#define LD1(p) _LD((p), ld1u)
+#define LD2(p) _LD((p), ld2u)
+#define LD4(p) _LD((p), ld4u)
+#define LD8(p) _LD((p), ld)
+#include "memcpy_64.c"
+
+unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
+ unsigned long n)
+{
+ unsigned long rc = __copy_from_user_inatomic(to, from, n);
+ if (unlikely(rc))
+ memset(to + n - rc, 0, rc);
+ return rc;
+}
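The routines built from these macros return the number of bytes left uncopied when a load or store faults, and __copy_from_user_zeroing() additionally zero-fills that tail. A hypothetical caller (the function and buffer names are illustrative, not part of this commit) relying on that contract:

#include <linux/errno.h>
#include <linux/uaccess.h>

/* Hypothetical caller, shown only to illustrate the return convention. */
static long read_from_user(void *kbuf, const void __user *ubuf,
			   unsigned long len)
{
	unsigned long left = __copy_from_user_zeroing(kbuf, ubuf, len);
	if (left)
		return -EFAULT;	/* part of the user range was unmapped */
	return len;
}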
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 57dbb3a5bff..2042bfe6595 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -12,13 +12,10 @@
* more details.
*/
-#include <arch/chip.h>
-
#include <linux/types.h>
#include <linux/string.h>
#include <linux/module.h>
-
-#undef memset
+#include <arch/chip.h>
void *memset(void *s, int c, size_t n)
{
@@ -26,11 +23,7 @@ void *memset(void *s, int c, size_t n)
int n32;
uint32_t v16, v32;
uint8_t *out8 = s;
-#if !CHIP_HAS_WH64()
- int ahead32;
-#else
int to_align32;
-#endif
/* Experimentation shows that a trivial tight loop is a win up until
* around a size of 20, where writing a word at a time starts to win.
@@ -61,21 +54,6 @@ void *memset(void *s, int c, size_t n)
return s;
}
-#if !CHIP_HAS_WH64()
- /* Use a spare issue slot to start prefetching the first cache
- * line early. This instruction is free as the store can be buried
- * in otherwise idle issue slots doing ALU ops.
- */
- __insn_prefetch(out8);
-
- /* We prefetch the end so that a short memset that spans two cache
- * lines gets some prefetching benefit. Again we believe this is free
- * to issue.
- */
- __insn_prefetch(&out8[n - 1]);
-#endif /* !CHIP_HAS_WH64() */
-
-
/* Align 'out8'. We know n >= 3 so this won't write past the end. */
while (((uintptr_t) out8 & 3) != 0) {
*out8++ = c;
@@ -96,90 +74,6 @@ void *memset(void *s, int c, size_t n)
/* This must be at least 8 or the following loop doesn't work. */
#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
-#if !CHIP_HAS_WH64()
-
- ahead32 = CACHE_LINE_SIZE_IN_WORDS;
-
- /* We already prefetched the first and last cache lines, so
- * we only need to do more prefetching if we are storing
- * to more than two cache lines.
- */
- if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
- int i;
-
- /* Prefetch the next several cache lines.
- * This is the setup code for the software-pipelined
- * loop below.
- */
-#define MAX_PREFETCH 5
- ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
- if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
- ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
-
- for (i = CACHE_LINE_SIZE_IN_WORDS;
- i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
- __insn_prefetch(&out32[i]);
- }
-
- if (n32 > ahead32) {
- while (1) {
- int j;
-
- /* Prefetch by reading one word several cache lines
- * ahead. Since loads are non-blocking this will
- * cause the full cache line to be read while we are
- * finishing earlier cache lines. Using a store
- * here causes microarchitectural performance
- * problems where a victimizing store miss goes to
- * the head of the retry FIFO and locks the pipe for
- * a few cycles. So a few subsequent stores in this
- * loop go into the retry FIFO, and then later
- * stores see other stores to the same cache line
- * are already in the retry FIFO and themselves go
- * into the retry FIFO, filling it up and grinding
- * to a halt waiting for the original miss to be
- * satisfied.
- */
- __insn_prefetch(&out32[ahead32]);
-
-#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
-#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
-#endif
-
- n32 -= CACHE_LINE_SIZE_IN_WORDS;
-
- /* Save icache space by only partially unrolling
- * this loop.
- */
- for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
- *out32++ = v32;
- *out32++ = v32;
- *out32++ = v32;
- *out32++ = v32;
- }
-
- /* To save compiled code size, reuse this loop even
- * when we run out of prefetching to do by dropping
- * ahead32 down.
- */
- if (n32 <= ahead32) {
- /* Not even a full cache line left,
- * so stop now.
- */
- if (n32 < CACHE_LINE_SIZE_IN_WORDS)
- break;
-
- /* Choose a small enough value that we don't
- * prefetch past the end. There's no sense
- * in touching cache lines we don't have to.
- */
- ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
- }
- }
- }
-
-#else /* CHIP_HAS_WH64() */
-
/* Determine how many words we need to emit before the 'out32'
* pointer becomes aligned modulo the cache line size.
*/
@@ -236,8 +130,6 @@ void *memset(void *s, int c, size_t n)
n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
}
-#endif /* CHIP_HAS_WH64() */
-
/* Now handle any leftover values. */
if (n32 != 0) {
do {
diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c
new file mode 100644
index 00000000000..03ef69cd73d
--- /dev/null
+++ b/arch/tile/lib/memset_64.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <arch/chip.h>
+#include "string-endian.h"
+
+void *memset(void *s, int c, size_t n)
+{
+ uint64_t *out64;
+ int n64, to_align64;
+ uint64_t v64;
+ uint8_t *out8 = s;
+
+ /* Experimentation shows that a trivial tight loop is a win up until
+ * around a size of 20, where writing a word at a time starts to win.
+ */
+#define BYTE_CUTOFF 20
+
+#if BYTE_CUTOFF < 7
+	/* This must be at least this big, or some code later
+ * on doesn't work.
+ */
+#error "BYTE_CUTOFF is too small"
+#endif
+
+ if (n < BYTE_CUTOFF) {
+ /* Strangely, this turns out to be the tightest way to
+ * write this loop.
+ */
+ if (n != 0) {
+ do {
+ /* Strangely, combining these into one line
+ * performs worse.
+ */
+ *out8 = c;
+ out8++;
+ } while (--n != 0);
+ }
+
+ return s;
+ }
+
+ /* Align 'out8'. We know n >= 7 so this won't write past the end. */
+ while (((uintptr_t) out8 & 7) != 0) {
+ *out8++ = c;
+ --n;
+ }
+
+ /* Align 'n'. */
+ while (n & 7)
+ out8[--n] = c;
+
+ out64 = (uint64_t *) out8;
+ n64 = n >> 3;
+
+ /* Tile input byte out to 64 bits. */
+ v64 = copy_byte(c);
+
+ /* This must be at least 8 or the following loop doesn't work. */
+#define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8)
+
+	/* Determine how many doublewords we need to emit before the 'out64'
+	 * pointer becomes aligned modulo the cache line size.
+ */
+ to_align64 = (-((uintptr_t)out64 >> 3)) &
+ (CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1);
+
+ /* Only bother aligning and using wh64 if there is at least
+ * one full cache line to process. This check also prevents
+ * overrunning the end of the buffer with alignment words.
+ */
+ if (to_align64 <= n64 - CACHE_LINE_SIZE_IN_DOUBLEWORDS) {
+ int lines_left;
+
+ /* Align out64 mod the cache line size so we can use wh64. */
+ n64 -= to_align64;
+ for (; to_align64 != 0; to_align64--) {
+ *out64 = v64;
+ out64++;
+ }
+
+ /* Use unsigned divide to turn this into a right shift. */
+ lines_left = (unsigned)n64 / CACHE_LINE_SIZE_IN_DOUBLEWORDS;
+
+ do {
+ /* Only wh64 a few lines at a time, so we don't
+ * exceed the maximum number of victim lines.
+ */
+ int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
+ ? lines_left
+ : CHIP_MAX_OUTSTANDING_VICTIMS());
+ uint64_t *wh = out64;
+ int i = x;
+ int j;
+
+ lines_left -= x;
+
+ do {
+ __insn_wh64(wh);
+ wh += CACHE_LINE_SIZE_IN_DOUBLEWORDS;
+ } while (--i);
+
+ for (j = x * (CACHE_LINE_SIZE_IN_DOUBLEWORDS / 4);
+ j != 0; j--) {
+ *out64++ = v64;
+ *out64++ = v64;
+ *out64++ = v64;
+ *out64++ = v64;
+ }
+ } while (lines_left != 0);
+
+ /* We processed all full lines above, so only this many
+	 * doublewords remain to be processed.
+ */
+ n64 &= CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1;
+ }
+
+ /* Now handle any leftover values. */
+ if (n64 != 0) {
+ do {
+ *out64 = v64;
+ out64++;
+ } while (--n64 != 0);
+ }
+
+ return s;
+}
+EXPORT_SYMBOL(memset);
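
Stripped of the TILE-Gx specifics (wh64 cache-line priming and the CHIP_MAX_OUTSTANDING_VICTIMS() batching), the routine above boils down to tiling the fill byte into a 64-bit word and storing doublewords between an aligned head and tail. A portable sketch of just that core, for reference (memset64_sketch is a made-up name, not part of this code):

	#include <stdint.h>
	#include <stddef.h>

	static void *memset64_sketch(void *s, int c, size_t n)
	{
		uint8_t *out8 = s;
		/* Tile the byte out to 64 bits, as copy_byte() does with shufflebytes. */
		uint64_t v64 = 0x0101010101010101ULL * (uint8_t)c;

		while (n && ((uintptr_t)out8 & 7)) {	/* align the head */
			*out8++ = c;
			n--;
		}
		for (; n >= 8; n -= 8) {		/* bulk doubleword stores */
			*(uint64_t *)out8 = v64;
			out8 += 8;
		}
		while (n--)				/* leftover tail bytes */
			*out8++ = c;
		return s;
	}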
diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c
index 485e24d62c6..b34f79aada4 100644
--- a/arch/tile/lib/spinlock_32.c
+++ b/arch/tile/lib/spinlock_32.c
@@ -15,6 +15,7 @@
#include <linux/spinlock.h>
#include <linux/module.h>
#include <asm/processor.h>
+#include <arch/spr_def.h>
#include "spinlock_common.h"
@@ -91,75 +92,75 @@ EXPORT_SYMBOL(arch_spin_unlock_wait);
#define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1)
-/* Lock the word, spinning until there are no tns-ers. */
-static inline u32 get_rwlock(arch_rwlock_t *rwlock)
-{
- u32 iterations = 0;
- for (;;) {
- u32 val = __insn_tns((int *)&rwlock->lock);
- if (unlikely(val & 1)) {
- delay_backoff(iterations++);
- continue;
- }
- return val;
- }
-}
-
-int arch_read_trylock_slow(arch_rwlock_t *rwlock)
-{
- u32 val = get_rwlock(rwlock);
- int locked = (val << RD_COUNT_WIDTH) == 0;
- rwlock->lock = val + (locked << RD_COUNT_SHIFT);
- return locked;
-}
-EXPORT_SYMBOL(arch_read_trylock_slow);
-
-void arch_read_unlock_slow(arch_rwlock_t *rwlock)
-{
- u32 val = get_rwlock(rwlock);
- rwlock->lock = val - (1 << RD_COUNT_SHIFT);
-}
-EXPORT_SYMBOL(arch_read_unlock_slow);
-
-void arch_write_unlock_slow(arch_rwlock_t *rwlock, u32 val)
+/*
+ * We can get the read lock if everything but the reader bits (which
+ * are in the high part of the word) is zero, i.e. no active or
+ * waiting writers, no tns.
+ *
+ * We guard the tns/store-back with an interrupt critical section to
+ * preserve the semantic that the same read lock can be acquired in an
+ * interrupt context.
+ */
+int arch_read_trylock(arch_rwlock_t *rwlock)
{
- u32 eq, mask = 1 << WR_CURR_SHIFT;
- while (unlikely(val & 1)) {
- /* Limited backoff since we are the highest-priority task. */
- relax(4);
- val = __insn_tns((int *)&rwlock->lock);
+ u32 val;
+ __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
+ val = __insn_tns((int *)&rwlock->lock);
+ if (likely((val << _RD_COUNT_WIDTH) == 0)) {
+ val += 1 << RD_COUNT_SHIFT;
+ rwlock->lock = val;
+ __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+ BUG_ON(val == 0); /* we don't expect wraparound */
+ return 1;
}
- val = __insn_addb(val, mask);
- eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT));
- val = __insn_mz(eq & mask, val);
- rwlock->lock = val;
+ if ((val & 1) == 0)
+ rwlock->lock = val;
+ __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+ return 0;
}
-EXPORT_SYMBOL(arch_write_unlock_slow);
+EXPORT_SYMBOL(arch_read_trylock);
/*
- * We spin until everything but the reader bits (which are in the high
- * part of the word) are zero, i.e. no active or waiting writers, no tns.
- *
+ * Spin doing arch_read_trylock() until we acquire the lock.
* ISSUE: This approach can permanently starve readers. A reader who sees
* a writer could instead take a ticket lock (just like a writer would),
* and atomically enter read mode (with 1 reader) when it gets the ticket.
- * This way both readers and writers will always make forward progress
+ * This way both readers and writers would always make forward progress
* in a finite time.
*/
-void arch_read_lock_slow(arch_rwlock_t *rwlock, u32 val)
+void arch_read_lock(arch_rwlock_t *rwlock)
{
u32 iterations = 0;
- do {
- if (!(val & 1))
- rwlock->lock = val;
+ while (unlikely(!arch_read_trylock(rwlock)))
delay_backoff(iterations++);
+}
+EXPORT_SYMBOL(arch_read_lock);
+
+void arch_read_unlock(arch_rwlock_t *rwlock)
+{
+ u32 val, iterations = 0;
+
+ mb(); /* guarantee anything modified under the lock is visible */
+ for (;;) {
+ __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
val = __insn_tns((int *)&rwlock->lock);
- } while ((val << RD_COUNT_WIDTH) != 0);
- rwlock->lock = val + (1 << RD_COUNT_SHIFT);
+ if (likely((val & 1) == 0)) {
+ rwlock->lock = val - (1 << _RD_COUNT_SHIFT);
+ __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+ break;
+ }
+ __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+ delay_backoff(iterations++);
+ }
}
-EXPORT_SYMBOL(arch_read_lock_slow);
+EXPORT_SYMBOL(arch_read_unlock);
-void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
+/*
+ * We don't need an interrupt critical section here (unlike for
+ * arch_read_lock) since we should never use a bare write lock where
+ * it could be interrupted by code that could try to re-acquire it.
+ */
+void arch_write_lock(arch_rwlock_t *rwlock)
{
/*
* The trailing underscore on this variable (and curr_ below)
@@ -167,23 +168,36 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
* when we compare them.
*/
u32 my_ticket_;
+ u32 iterations = 0;
+ u32 val = __insn_tns((int *)&rwlock->lock);
- /* Take out the next ticket; this will also stop would-be readers. */
- if (val & 1)
- val = get_rwlock(rwlock);
- rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT);
-
- /* Extract my ticket value from the original word. */
- my_ticket_ = val >> WR_NEXT_SHIFT;
+ if (likely(val == 0)) {
+ rwlock->lock = 1 << _WR_NEXT_SHIFT;
+ return;
+ }
/*
- * Wait until the "current" field matches our ticket, and
- * there are no remaining readers.
+ * Wait until there are no readers, then bump up the next
+ * field and capture the ticket value.
*/
for (;;) {
+ if (!(val & 1)) {
+ if ((val >> RD_COUNT_SHIFT) == 0)
+ break;
+ rwlock->lock = val;
+ }
+ delay_backoff(iterations++);
+ val = __insn_tns((int *)&rwlock->lock);
+ }
+
+ /* Take out the next ticket and extract my ticket value. */
+ rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT);
+ my_ticket_ = val >> WR_NEXT_SHIFT;
+
+ /* Wait until the "current" field matches our ticket. */
+ for (;;) {
u32 curr_ = val >> WR_CURR_SHIFT;
- u32 readers = val >> RD_COUNT_SHIFT;
- u32 delta = ((my_ticket_ - curr_) & WR_MASK) + !!readers;
+ u32 delta = ((my_ticket_ - curr_) & WR_MASK);
if (likely(delta == 0))
break;
@@ -199,23 +213,47 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
relax(4);
}
}
-EXPORT_SYMBOL(arch_write_lock_slow);
+EXPORT_SYMBOL(arch_write_lock);
-int __tns_atomic_acquire(atomic_t *lock)
+int arch_write_trylock(arch_rwlock_t *rwlock)
{
- int ret;
- u32 iterations = 0;
+ u32 val = __insn_tns((int *)&rwlock->lock);
- BUG_ON(__insn_mfspr(SPR_INTERRUPT_CRITICAL_SECTION));
- __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
+ /*
+ * If a tns is in progress, or there's a waiting or active locker,
+ * or active readers, we can't take the lock, so give up.
+ */
+ if (unlikely(val != 0)) {
+ if (!(val & 1))
+ rwlock->lock = val;
+ return 0;
+ }
- while ((ret = __insn_tns((void *)&lock->counter)) == 1)
- delay_backoff(iterations++);
- return ret;
+ /* Set the "next" field to mark it locked. */
+ rwlock->lock = 1 << _WR_NEXT_SHIFT;
+ return 1;
}
+EXPORT_SYMBOL(arch_write_trylock);
-void __tns_atomic_release(atomic_t *p, int v)
+void arch_write_unlock(arch_rwlock_t *rwlock)
{
- p->counter = v;
- __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+ u32 val, eq, mask;
+
+ mb(); /* guarantee anything modified under the lock is visible */
+ val = __insn_tns((int *)&rwlock->lock);
+ if (likely(val == (1 << _WR_NEXT_SHIFT))) {
+ rwlock->lock = 0;
+ return;
+ }
+ while (unlikely(val & 1)) {
+ /* Limited backoff since we are the highest-priority task. */
+ relax(4);
+ val = __insn_tns((int *)&rwlock->lock);
+ }
+ mask = 1 << WR_CURR_SHIFT;
+ val = __insn_addb(val, mask);
+ eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT));
+ val = __insn_mz(eq & mask, val);
+ rwlock->lock = val;
}
+EXPORT_SYMBOL(arch_write_unlock);
diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c
new file mode 100644
index 00000000000..d6fb9581e98
--- /dev/null
+++ b/arch/tile/lib/spinlock_64.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+
+#include "spinlock_common.h"
+
+/*
+ * Read the spinlock value without allocating in our cache and without
+ * causing an invalidation to another cpu with a copy of the cacheline.
+ * This is important when we are spinning waiting for the lock.
+ */
+static inline u32 arch_spin_read_noalloc(void *lock)
+{
+ return atomic_cmpxchg((atomic_t *)lock, -1, -1);
+}
+
+/*
+ * Wait until the high bits (current) match my ticket.
+ * If we notice the overflow bit set on entry, we clear it.
+ */
+void arch_spin_lock_slow(arch_spinlock_t *lock, u32 my_ticket)
+{
+ if (unlikely(my_ticket & __ARCH_SPIN_NEXT_OVERFLOW)) {
+ __insn_fetchand4(&lock->lock, ~__ARCH_SPIN_NEXT_OVERFLOW);
+ my_ticket &= ~__ARCH_SPIN_NEXT_OVERFLOW;
+ }
+
+ for (;;) {
+ u32 val = arch_spin_read_noalloc(lock);
+ u32 delta = my_ticket - arch_spin_current(val);
+ if (delta == 0)
+ return;
+ relax((128 / CYCLES_PER_RELAX_LOOP) * delta);
+ }
+}
+EXPORT_SYMBOL(arch_spin_lock_slow);
+
+/*
+ * Check the lock to see if it is plausible, and try to get it with cmpxchg().
+ */
+int arch_spin_trylock(arch_spinlock_t *lock)
+{
+ u32 val = arch_spin_read_noalloc(lock);
+ if (unlikely(arch_spin_current(val) != arch_spin_next(val)))
+ return 0;
+ return cmpxchg(&lock->lock, val, (val + 1) & ~__ARCH_SPIN_NEXT_OVERFLOW)
+ == val;
+}
+EXPORT_SYMBOL(arch_spin_trylock);
+
+void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ u32 iterations = 0;
+ while (arch_spin_is_locked(lock))
+ delay_backoff(iterations++);
+}
+EXPORT_SYMBOL(arch_spin_unlock_wait);
+
+/*
+ * If the read lock fails due to a writer, we retry periodically
+ * until the value is positive and we write our incremented reader count.
+ */
+void __read_lock_failed(arch_rwlock_t *rw)
+{
+ u32 val;
+ int iterations = 0;
+ do {
+ delay_backoff(iterations++);
+ val = __insn_fetchaddgez4(&rw->lock, 1);
+ } while (unlikely(arch_write_val_locked(val)));
+}
+EXPORT_SYMBOL(__read_lock_failed);
+
+/*
+ * If we failed because there were readers, clear the "writer" bit
+ * so we don't block additional readers. Otherwise, there was another
+ * writer anyway, so our "fetchor" made no difference. Then wait,
+ * issuing periodic fetchor instructions, till we get the lock.
+ */
+void __write_lock_failed(arch_rwlock_t *rw, u32 val)
+{
+ int iterations = 0;
+ do {
+ if (!arch_write_val_locked(val))
+ val = __insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT);
+ delay_backoff(iterations++);
+ val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT);
+ } while (val != 0);
+}
+EXPORT_SYMBOL(__write_lock_failed);
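
For reference, the reader slow path above can be modeled with portable C11 atomics. fetchaddgez4 (add only if the result stays non-negative) has no direct C equivalent, so this sketch approximates it with an add/back-out pair, and the writer-bit position is an assumption rather than the real __WRITE_LOCK_BIT value:

	#include <stdatomic.h>
	#include <stdint.h>

	#define WRITE_BIT (1u << 31)	/* assumption: sign bit marks an active writer */

	static void read_lock_slow_model(_Atomic uint32_t *lock)
	{
		for (;;) {
			uint32_t val = atomic_fetch_add(lock, 1);
			if (!(val & WRITE_BIT))
				return;			/* our increment counts as a reader */
			atomic_fetch_sub(lock, 1);	/* back out; a writer holds the lock */
			/* delay_backoff(iterations++) would go here */
		}
	}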
diff --git a/arch/tile/lib/spinlock_common.h b/arch/tile/lib/spinlock_common.h
index c1010980913..6ac37509fac 100644
--- a/arch/tile/lib/spinlock_common.h
+++ b/arch/tile/lib/spinlock_common.h
@@ -60,5 +60,5 @@ static void delay_backoff(int iterations)
loops += __insn_crc32_32(stack_pointer, get_cycles_low()) &
(loops - 1);
- relax(1 << exponent);
+ relax(loops);
}
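
The change above makes the delay use the pseudo-random jitter that was computed into 'loops' instead of discarding it. Conceptually the backoff is a capped exponential delay plus jitter, roughly like this userspace sketch (the cap of 8 and the use of rand() are placeholders for the real exponent clamp and crc32-based hash):

	#include <stdlib.h>

	static void backoff_sketch(int iterations)
	{
		volatile int spin;
		int exponent = iterations < 8 ? iterations : 8;	/* clamp the exponent */
		int loops = 1 << exponent;
		loops += rand() & (loops - 1);	/* add jitter below the next power of two */
		for (spin = 0; spin < loops; spin++)
			;			/* busy-wait */
	}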
diff --git a/arch/tile/lib/strchr_32.c b/arch/tile/lib/strchr_32.c
index c94e6f7ae7b..841fe696301 100644
--- a/arch/tile/lib/strchr_32.c
+++ b/arch/tile/lib/strchr_32.c
@@ -16,8 +16,6 @@
#include <linux/string.h>
#include <linux/module.h>
-#undef strchr
-
char *strchr(const char *s, int c)
{
int z, g;
diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c
new file mode 100644
index 00000000000..fe6e31c06f8
--- /dev/null
+++ b/arch/tile/lib/strchr_64.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include "string-endian.h"
+
+char *strchr(const char *s, int c)
+{
+ int z, g;
+
+ /* Get an aligned pointer. */
+ const uintptr_t s_int = (uintptr_t) s;
+ const uint64_t *p = (const uint64_t *)(s_int & -8);
+
+ /* Create eight copies of the byte for which we are looking. */
+ const uint64_t goal = copy_byte(c);
+
+ /* Read the first aligned word, but force bytes before the string to
+ * match neither zero nor goal (we make sure the high bit of each
+ * byte is 1, and the low 7 bits are all the opposite of the goal
+ * byte).
+ */
+ const uint64_t before_mask = MASK(s_int);
+ uint64_t v = (*p | before_mask) ^ (goal & __insn_v1shrui(before_mask, 1));
+
+ uint64_t zero_matches, goal_matches;
+ while (1) {
+ /* Look for a terminating '\0'. */
+ zero_matches = __insn_v1cmpeqi(v, 0);
+
+ /* Look for the goal byte. */
+ goal_matches = __insn_v1cmpeq(v, goal);
+
+ if (__builtin_expect((zero_matches | goal_matches) != 0, 0))
+ break;
+
+ v = *++p;
+ }
+
+ z = CFZ(zero_matches);
+ g = CFZ(goal_matches);
+
+ /* If we found c before '\0' we got a match. Note that if c == '\0'
+ * then g == z, and we correctly return the address of the '\0'
+ * rather than NULL.
+ */
+ return (g <= z) ? ((char *)p) + (g >> 3) : NULL;
+}
+EXPORT_SYMBOL(strchr);
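
The loop above relies on v1cmpeq/v1cmpeqi producing a per-byte match mask in a single instruction. On targets without such SIMD-within-a-register compares, the same scan is usually written with the classic zero-byte bit trick; a portable sketch of the detection step only (has_zero_byte is an illustrative helper, not part of this code):

	#include <stdint.h>

	/* Nonzero iff 'v' contains a zero byte; XOR v with eight copies of the
	 * goal byte first to detect the goal byte the same way. */
	static inline uint64_t has_zero_byte(uint64_t v)
	{
		return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
	}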
diff --git a/arch/tile/lib/string-endian.h b/arch/tile/lib/string-endian.h
new file mode 100644
index 00000000000..2e49cbfe937
--- /dev/null
+++ b/arch/tile/lib/string-endian.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ *
+ * Provide a mask based on the pointer alignment that
+ * sets up non-zero bytes before the beginning of the string.
+ * The MASK expression works because shift counts are taken mod 64.
+ * Also, specify how to count "first" and "last" bits
+ * when the bits have been read as a word.
+ */
+
+#include <asm/byteorder.h>
+
+#ifdef __LITTLE_ENDIAN
+#define MASK(x) (__insn_shl(1ULL, (x << 3)) - 1)
+#define NULMASK(x) ((2ULL << x) - 1)
+#define CFZ(x) __insn_ctz(x)
+#define REVCZ(x) __insn_clz(x)
+#else
+#define MASK(x) (__insn_shl(-2LL, ((-x << 3) - 1)))
+#define NULMASK(x) (-2LL << (63 - x))
+#define CFZ(x) __insn_clz(x)
+#define REVCZ(x) __insn_ctz(x)
+#endif
+
+/*
+ * Create eight copies of the byte in a uint64_t. Byte Shuffle uses
+ * the bytes of srcB as the index into the dest vector to select a
+ * byte. With all indices of zero, the first byte is copied into all
+ * the other bytes.
+ */
+static inline uint64_t copy_byte(uint8_t byte)
+{
+ return __insn_shufflebytes(byte, 0, 0);
+}
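
On a little-endian TILE-Gx, MASK() simply turns the pointer's misalignment into a word whose low bytes are all ones (shift counts are taken mod 64, so an aligned pointer yields zero). A small standalone model of that behavior, using a plain C shift in place of __insn_shl:

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t mask_le_model(uintptr_t s_int)
	{
		unsigned k = s_int & 7;			/* misalignment in bytes */
		return ((uint64_t)1 << (k * 8)) - 1;	/* low k bytes set to 0xff */
	}

	int main(void)
	{
		/* e.g. a pointer ending in ...3 masks off its three low bytes */
		printf("%016llx\n", (unsigned long long)mask_le_model(0x1003));
		return 0;
	}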
diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c
index 4974292a553..f26f88e11e4 100644
--- a/arch/tile/lib/strlen_32.c
+++ b/arch/tile/lib/strlen_32.c
@@ -16,8 +16,6 @@
#include <linux/string.h>
#include <linux/module.h>
-#undef strlen
-
size_t strlen(const char *s)
{
/* Get an aligned pointer. */
diff --git a/arch/tile/lib/strlen_64.c b/arch/tile/lib/strlen_64.c
new file mode 100644
index 00000000000..9583fc3361f
--- /dev/null
+++ b/arch/tile/lib/strlen_64.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include "string-endian.h"
+
+size_t strlen(const char *s)
+{
+ /* Get an aligned pointer. */
+ const uintptr_t s_int = (uintptr_t) s;
+ const uint64_t *p = (const uint64_t *)(s_int & -8);
+
+ /* Read and MASK the first word. */
+ uint64_t v = *p | MASK(s_int);
+
+ uint64_t bits;
+ while ((bits = __insn_v1cmpeqi(v, 0)) == 0)
+ v = *++p;
+
+ return ((const char *)p) + (CFZ(bits) >> 3) - s;
+}
+EXPORT_SYMBOL(strlen);
diff --git a/arch/tile/lib/strnlen_32.c b/arch/tile/lib/strnlen_32.c
new file mode 100644
index 00000000000..1434141d9e0
--- /dev/null
+++ b/arch/tile/lib/strnlen_32.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+size_t strnlen(const char *s, size_t count)
+{
+ /* Get an aligned pointer. */
+ const uintptr_t s_int = (uintptr_t) s;
+ const uint32_t *p = (const uint32_t *)(s_int & -4);
+ size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
+ size_t len;
+ uint32_t v, bits;
+
+ /* Avoid page fault risk by not reading any bytes when count is 0. */
+ if (count == 0)
+ return 0;
+
+ /* Read first word, but force bytes before the string to be nonzero. */
+ v = *p | ((1 << ((s_int << 3) & 31)) - 1);
+
+ while ((bits = __insn_seqb(v, 0)) == 0) {
+ if (bytes_read >= count) {
+			/* We read COUNT bytes without finding the terminator. */
+ return count;
+ }
+ v = *++p;
+ bytes_read += sizeof(v);
+ }
+
+ len = ((const char *) p) + (__insn_ctz(bits) >> 3) - s;
+ return (len < count ? len : count);
+}
+EXPORT_SYMBOL(strnlen);
diff --git a/arch/tile/lib/strnlen_64.c b/arch/tile/lib/strnlen_64.c
new file mode 100644
index 00000000000..2e8de6a5136
--- /dev/null
+++ b/arch/tile/lib/strnlen_64.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include "string-endian.h"
+
+size_t strnlen(const char *s, size_t count)
+{
+ /* Get an aligned pointer. */
+ const uintptr_t s_int = (uintptr_t) s;
+ const uint64_t *p = (const uint64_t *)(s_int & -8);
+ size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
+ size_t len;
+ uint64_t v, bits;
+
+ /* Avoid page fault risk by not reading any bytes when count is 0. */
+ if (count == 0)
+ return 0;
+
+ /* Read and MASK the first word. */
+ v = *p | MASK(s_int);
+
+ while ((bits = __insn_v1cmpeqi(v, 0)) == 0) {
+ if (bytes_read >= count) {
+			/* We read COUNT bytes without finding the terminator. */
+ return count;
+ }
+ v = *++p;
+ bytes_read += sizeof(v);
+ }
+
+ len = ((const char *) p) + (CFZ(bits) >> 3) - s;
+ return (len < count ? len : count);
+}
+EXPORT_SYMBOL(strnlen);
diff --git a/arch/tile/lib/uaccess.c b/arch/tile/lib/uaccess.c
index f8d398c9ee7..030abe3ee4f 100644
--- a/arch/tile/lib/uaccess.c
+++ b/arch/tile/lib/uaccess.c
@@ -22,11 +22,3 @@ int __range_ok(unsigned long addr, unsigned long size)
is_arch_mappable_range(addr, size));
}
EXPORT_SYMBOL(__range_ok);
-
-#ifdef CONFIG_DEBUG_COPY_FROM_USER
-void copy_from_user_overflow(void)
-{
- WARN(1, "Buffer overflow detected!\n");
-}
-EXPORT_SYMBOL(copy_from_user_overflow);
-#endif
diff --git a/arch/tile/lib/usercopy_32.S b/arch/tile/lib/usercopy_32.S
index 979f76d8374..1bc16222463 100644
--- a/arch/tile/lib/usercopy_32.S
+++ b/arch/tile/lib/usercopy_32.S
@@ -19,82 +19,6 @@
/* Access user memory, but use MMU to avoid propagating kernel exceptions. */
- .pushsection .fixup,"ax"
-
-get_user_fault:
- { move r0, zero; move r1, zero }
- { movei r2, -EFAULT; jrp lr }
- ENDPROC(get_user_fault)
-
-put_user_fault:
- { movei r0, -EFAULT; jrp lr }
- ENDPROC(put_user_fault)
-
- .popsection
-
-/*
- * __get_user_N functions take a pointer in r0, and return 0 in r2
- * on success, with the value in r0; or else -EFAULT in r2.
- */
-#define __get_user_N(bytes, LOAD) \
- STD_ENTRY(__get_user_##bytes); \
-1: { LOAD r0, r0; move r1, zero; move r2, zero }; \
- jrp lr; \
- STD_ENDPROC(__get_user_##bytes); \
- .pushsection __ex_table,"a"; \
- .word 1b, get_user_fault; \
- .popsection
-
-__get_user_N(1, lb_u)
-__get_user_N(2, lh_u)
-__get_user_N(4, lw)
-
-/*
- * __get_user_8 takes a pointer in r0, and returns 0 in r2
- * on success, with the value in r0/r1; or else -EFAULT in r2.
- */
- STD_ENTRY(__get_user_8);
-1: { lw r0, r0; addi r1, r0, 4 };
-2: { lw r1, r1; move r2, zero };
- jrp lr;
- STD_ENDPROC(__get_user_8);
- .pushsection __ex_table,"a";
- .word 1b, get_user_fault;
- .word 2b, get_user_fault;
- .popsection
-
-/*
- * __put_user_N functions take a value in r0 and a pointer in r1,
- * and return 0 in r0 on success or -EFAULT on failure.
- */
-#define __put_user_N(bytes, STORE) \
- STD_ENTRY(__put_user_##bytes); \
-1: { STORE r1, r0; move r0, zero }; \
- jrp lr; \
- STD_ENDPROC(__put_user_##bytes); \
- .pushsection __ex_table,"a"; \
- .word 1b, put_user_fault; \
- .popsection
-
-__put_user_N(1, sb)
-__put_user_N(2, sh)
-__put_user_N(4, sw)
-
-/*
- * __put_user_8 takes a value in r0/r1 and a pointer in r2,
- * and returns 0 in r0 on success or -EFAULT on failure.
- */
-STD_ENTRY(__put_user_8)
-1: { sw r2, r0; addi r2, r2, 4 }
-2: { sw r2, r1; move r0, zero }
- jrp lr
- STD_ENDPROC(__put_user_8)
- .pushsection __ex_table,"a"
- .word 1b, put_user_fault
- .word 2b, put_user_fault
- .popsection
-
-
/*
* strnlen_user_asm takes the pointer in r0, and the length bound in r1.
* It returns the length, including the terminating NUL, or zero on exception.
@@ -112,6 +36,7 @@ strnlen_user_fault:
{ move r0, zero; jrp lr }
ENDPROC(strnlen_user_fault)
.section __ex_table,"a"
+ .align 4
.word 1b, strnlen_user_fault
.popsection
@@ -123,18 +48,20 @@ strnlen_user_fault:
*/
STD_ENTRY(strncpy_from_user_asm)
{ bz r2, 2f; move r3, r0 }
-1: { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
+1: { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
{ sb r0, r4; addi r0, r0, 1 }
- bz r2, 2f
- bnzt r4, 1b
- addi r0, r0, -1 /* don't count the trailing NUL */
-2: { sub r0, r0, r3; jrp lr }
+ bz r4, 2f
+ bnzt r2, 1b
+ { sub r0, r0, r3; jrp lr }
+2: addi r0, r0, -1 /* don't count the trailing NUL */
+ { sub r0, r0, r3; jrp lr }
STD_ENDPROC(strncpy_from_user_asm)
.pushsection .fixup,"ax"
strncpy_from_user_fault:
{ movei r0, -EFAULT; jrp lr }
ENDPROC(strncpy_from_user_fault)
.section __ex_table,"a"
+ .align 4
.word 1b, strncpy_from_user_fault
.popsection
@@ -153,6 +80,7 @@ STD_ENTRY(clear_user_asm)
bnzt r1, 1b
2: { move r0, r1; jrp lr }
.pushsection __ex_table,"a"
+ .align 4
.word 1b, 2b
.popsection
@@ -162,6 +90,7 @@ STD_ENTRY(clear_user_asm)
2: { move r0, r1; jrp lr }
STD_ENDPROC(clear_user_asm)
.pushsection __ex_table,"a"
+ .align 4
.word 1b, 2b
.popsection
@@ -181,25 +110,7 @@ STD_ENTRY(flush_user_asm)
2: { move r0, r1; jrp lr }
STD_ENDPROC(flush_user_asm)
.pushsection __ex_table,"a"
- .word 1b, 2b
- .popsection
-
-/*
- * inv_user_asm takes the user target address in r0 and the
- * number of bytes to invalidate in r1.
- * It returns the number of not inv'able bytes (hopefully zero) in r0.
- */
-STD_ENTRY(inv_user_asm)
- bz r1, 2f
- { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
- { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
- { and r0, r0, r2; and r1, r1, r2 }
- { sub r1, r1, r0 }
-1: { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
- { addi r0, r0, CHIP_INV_STRIDE(); bnzt r1, 1b }
-2: { move r0, r1; jrp lr }
- STD_ENDPROC(inv_user_asm)
- .pushsection __ex_table,"a"
+ .align 4
.word 1b, 2b
.popsection
@@ -219,5 +130,6 @@ STD_ENTRY(finv_user_asm)
2: { move r0, r1; jrp lr }
STD_ENDPROC(finv_user_asm)
.pushsection __ex_table,"a"
+ .align 4
.word 1b, 2b
.popsection
diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S
new file mode 100644
index 00000000000..b3b31a3306f
--- /dev/null
+++ b/arch/tile/lib/usercopy_64.S
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/cache.h>
+#include <arch/chip.h>
+
+/* Access user memory, but use MMU to avoid propagating kernel exceptions. */
+
+/*
+ * strnlen_user_asm takes the pointer in r0, and the length bound in r1.
+ * It returns the length, including the terminating NUL, or zero on exception.
+ * If length is greater than the bound, returns one plus the bound.
+ */
+STD_ENTRY(strnlen_user_asm)
+ { beqz r1, 2f; addi r3, r0, -1 } /* bias down to include NUL */
+1: { ld1u r4, r0; addi r1, r1, -1 }
+ beqz r4, 2f
+ { bnezt r1, 1b; addi r0, r0, 1 }
+2: { sub r0, r0, r3; jrp lr }
+ STD_ENDPROC(strnlen_user_asm)
+ .pushsection .fixup,"ax"
+strnlen_user_fault:
+ { move r0, zero; jrp lr }
+ ENDPROC(strnlen_user_fault)
+ .section __ex_table,"a"
+ .align 8
+ .quad 1b, strnlen_user_fault
+ .popsection
+
+/*
+ * strncpy_from_user_asm takes the kernel target pointer in r0,
+ * the userspace source pointer in r1, and the length bound (including
+ * the trailing NUL) in r2. On success, it returns the string length
+ * (not including the trailing NUL), or -EFAULT on failure.
+ */
+STD_ENTRY(strncpy_from_user_asm)
+ { beqz r2, 2f; move r3, r0 }
+1: { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
+ { st1 r0, r4; addi r0, r0, 1 }
+ beqz r4, 2f
+ bnezt r2, 1b
+ { sub r0, r0, r3; jrp lr }
+2: addi r0, r0, -1 /* don't count the trailing NUL */
+ { sub r0, r0, r3; jrp lr }
+ STD_ENDPROC(strncpy_from_user_asm)
+ .pushsection .fixup,"ax"
+strncpy_from_user_fault:
+ { movei r0, -EFAULT; jrp lr }
+ ENDPROC(strncpy_from_user_fault)
+ .section __ex_table,"a"
+ .align 8
+ .quad 1b, strncpy_from_user_fault
+ .popsection
+
+/*
+ * clear_user_asm takes the user target address in r0 and the
+ * number of bytes to zero in r1.
+ * It returns the number of unzeroable bytes (hopefully zero) in r0.
+ * Note that we don't use a separate .fixup section here since we fall
+ * through into the "fixup" code as the last straight-line bundle anyway.
+ */
+STD_ENTRY(clear_user_asm)
+ { beqz r1, 2f; or r2, r0, r1 }
+ andi r2, r2, 7
+ beqzt r2, .Lclear_aligned_user_asm
+1: { st1 r0, zero; addi r0, r0, 1; addi r1, r1, -1 }
+ bnezt r1, 1b
+2: { move r0, r1; jrp lr }
+ .pushsection __ex_table,"a"
+ .align 8
+ .quad 1b, 2b
+ .popsection
+
+.Lclear_aligned_user_asm:
+1: { st r0, zero; addi r0, r0, 8; addi r1, r1, -8 }
+ bnezt r1, 1b
+2: { move r0, r1; jrp lr }
+ STD_ENDPROC(clear_user_asm)
+ .pushsection __ex_table,"a"
+ .align 8
+ .quad 1b, 2b
+ .popsection
+
+/*
+ * flush_user_asm takes the user target address in r0 and the
+ * number of bytes to flush in r1.
+ * It returns the number of unflushable bytes (hopefully zero) in r0.
+ */
+STD_ENTRY(flush_user_asm)
+ beqz r1, 2f
+ { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
+ { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
+ { and r0, r0, r2; and r1, r1, r2 }
+ { sub r1, r1, r0 }
+1: { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() }
+ { addi r0, r0, CHIP_FLUSH_STRIDE(); bnezt r1, 1b }
+2: { move r0, r1; jrp lr }
+ STD_ENDPROC(flush_user_asm)
+ .pushsection __ex_table,"a"
+ .align 8
+ .quad 1b, 2b
+ .popsection
+
+/*
+ * finv_user_asm takes the user target address in r0 and the
+ * number of bytes to flush-invalidate in r1.
+ * It returns the number of not finv'able bytes (hopefully zero) in r0.
+ */
+STD_ENTRY(finv_user_asm)
+ beqz r1, 2f
+ { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
+ { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
+ { and r0, r0, r2; and r1, r1, r2 }
+ { sub r1, r1, r0 }
+1: { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() }
+ { addi r0, r0, CHIP_FINV_STRIDE(); bnezt r1, 1b }
+2: { move r0, r1; jrp lr }
+ STD_ENDPROC(finv_user_asm)
+ .pushsection __ex_table,"a"
+ .align 8
+ .quad 1b, 2b
+ .popsection