Diffstat (limited to 'arch/tile/lib')
30 files changed, 1518 insertions, 897 deletions
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile index 93122d5b155..c4211cbb202 100644 --- a/arch/tile/lib/Makefile +++ b/arch/tile/lib/Makefile @@ -2,17 +2,17 @@  # Makefile for TILE-specific library files..  # -lib-y = cacheflush.o checksum.o cpumask.o delay.o \ -	mb_incoherent.o uaccess.o memmove.o \ -	memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \ -	strchr_$(BITS).o strlen_$(BITS).o - -ifeq ($(CONFIG_TILEGX),y) -lib-y += memcpy_user_64.o -else -lib-y += atomic_32.o atomic_asm_32.o memcpy_tile64.o -endif +lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \ +	memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \ +	strchr_$(BITS).o strlen_$(BITS).o strnlen_$(BITS).o +lib-$(CONFIG_TILEGX) += memcpy_user_64.o +lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o  lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o  obj-$(CONFIG_MODULES) += exports.o + +# The finv_buffer_remote() and copy_{to,from}_user() routines can't +# have -pg added, since they both rely on being leaf functions. +CFLAGS_REMOVE_cacheflush.o = -pg +CFLAGS_REMOVE_memcpy_user_64.o = -pg diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c index 7a5cc706ab6..c89b211fd9e 100644 --- a/arch/tile/lib/atomic_32.c +++ b/arch/tile/lib/atomic_32.c @@ -17,55 +17,15 @@  #include <linux/uaccess.h>  #include <linux/module.h>  #include <linux/mm.h> -#include <asm/atomic.h> -#include <asm/futex.h> +#include <linux/atomic.h>  #include <arch/chip.h> -/* See <asm/atomic_32.h> */ -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - -/* - * A block of memory containing locks for atomic ops. Each instance of this - * struct will be homed on a different CPU. - */ -struct atomic_locks_on_cpu { -	int lock[ATOMIC_HASH_L2_SIZE]; -} __attribute__((aligned(ATOMIC_HASH_L2_SIZE * 4))); - -static DEFINE_PER_CPU(struct atomic_locks_on_cpu, atomic_lock_pool); - -/* The locks we'll use until __init_atomic_per_cpu is called. */ -static struct atomic_locks_on_cpu __initdata initial_atomic_locks; - -/* Hash into this vector to get a pointer to lock for the given atomic. */ -struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE] -	__write_once = { -	[0 ... ATOMIC_HASH_L1_SIZE-1] (&initial_atomic_locks) -}; - -#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ -  /* This page is remapped on startup to be hash-for-home. */ -int atomic_locks[PAGE_SIZE / sizeof(int) /* Only ATOMIC_HASH_SIZE is used */] -  __attribute__((aligned(PAGE_SIZE), section(".bss.page_aligned"))); - -#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ +int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss; -static inline int *__atomic_hashed_lock(volatile void *v) +int *__atomic_hashed_lock(volatile void *v)  { -	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec.S */ -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() -	unsigned long i = -		(unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long)); -	unsigned long n = __insn_crc32_32(0, i); - -	/* Grab high bits for L1 index. */ -	unsigned long l1_index = n >> ((sizeof(n) * 8) - ATOMIC_HASH_L1_SHIFT); -	/* Grab low bits for L2 index. */ -	unsigned long l2_index = n & (ATOMIC_HASH_L2_SIZE - 1); - -	return &atomic_lock_ptr[l1_index]->lock[l2_index]; -#else +	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */  	/*  	 * Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index.  	 * Using mm works here because atomic_locks is page aligned. 
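The retained fast path hashes each atomic's address straight into the page-aligned atomic_locks[] array: per the comment above, bits [3, 3 + ATOMIC_HASH_SHIFT) of the address select the lock, computed in a single "mm" bit-merge instruction. A hedged portable sketch of that selection, not from the patch (helper name illustrative; ATOMIC_HASH_SHIFT as defined in asm/atomic_32.h):

/* Sketch: what the single __insn_mm() in __atomic_hashed_lock() computes,
 * following the comment above: index = bits [3, 3 + ATOMIC_HASH_SHIFT). */
static int *hashed_lock_sketch(volatile void *v)
{
	unsigned long idx = ((unsigned long)v >> 3) &
			    ((1UL << ATOMIC_HASH_SHIFT) - 1);
	return &atomic_locks[idx];
}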
@@ -74,26 +34,13 @@ static inline int *__atomic_hashed_lock(volatile void *v)  				      (unsigned long)atomic_locks,  				      2, (ATOMIC_HASH_SHIFT + 2) - 1);  	return (int *)ptr; -#endif  }  #ifdef CONFIG_SMP  /* Return whether the passed pointer is a valid atomic lock pointer. */  static int is_atomic_lock(int *p)  { -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() -	int i; -	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) { - -		if (p >= &atomic_lock_ptr[i]->lock[0] && -		    p < &atomic_lock_ptr[i]->lock[ATOMIC_HASH_L2_SIZE]) { -			return 1; -		} -	} -	return 0; -#else  	return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE]; -#endif  }  void __atomic_fault_unlock(int *irqlock_word) @@ -112,33 +59,32 @@ static inline int *__atomic_setup(volatile void *v)  	return __atomic_hashed_lock(v);  } -int _atomic_xchg(atomic_t *v, int n) +int _atomic_xchg(int *v, int n)  { -	return __atomic_xchg(&v->counter, __atomic_setup(v), n).val; +	return __atomic_xchg(v, __atomic_setup(v), n).val;  }  EXPORT_SYMBOL(_atomic_xchg); -int _atomic_xchg_add(atomic_t *v, int i) +int _atomic_xchg_add(int *v, int i)  { -	return __atomic_xchg_add(&v->counter, __atomic_setup(v), i).val; +	return __atomic_xchg_add(v, __atomic_setup(v), i).val;  }  EXPORT_SYMBOL(_atomic_xchg_add); -int _atomic_xchg_add_unless(atomic_t *v, int a, int u) +int _atomic_xchg_add_unless(int *v, int a, int u)  {  	/*  	 * Note: argument order is switched here since it is easier  	 * to use the first argument consistently as the "old value"  	 * in the assembly, as is done for _atomic_cmpxchg().  	 */ -	return __atomic_xchg_add_unless(&v->counter, __atomic_setup(v), u, a) -		.val; +	return __atomic_xchg_add_unless(v, __atomic_setup(v), u, a).val;  }  EXPORT_SYMBOL(_atomic_xchg_add_unless); -int _atomic_cmpxchg(atomic_t *v, int o, int n) +int _atomic_cmpxchg(int *v, int o, int n)  { -	return __atomic_cmpxchg(&v->counter, __atomic_setup(v), o, n).val; +	return __atomic_cmpxchg(v, __atomic_setup(v), o, n).val;  }  EXPORT_SYMBOL(_atomic_cmpxchg); @@ -161,78 +107,36 @@ unsigned long _atomic_xor(volatile unsigned long *p, unsigned long mask)  EXPORT_SYMBOL(_atomic_xor); -u64 _atomic64_xchg(atomic64_t *v, u64 n) +long long _atomic64_xchg(long long *v, long long n)  { -	return __atomic64_xchg(&v->counter, __atomic_setup(v), n); +	return __atomic64_xchg(v, __atomic_setup(v), n);  }  EXPORT_SYMBOL(_atomic64_xchg); -u64 _atomic64_xchg_add(atomic64_t *v, u64 i) +long long _atomic64_xchg_add(long long *v, long long i)  { -	return __atomic64_xchg_add(&v->counter, __atomic_setup(v), i); +	return __atomic64_xchg_add(v, __atomic_setup(v), i);  }  EXPORT_SYMBOL(_atomic64_xchg_add); -u64 _atomic64_xchg_add_unless(atomic64_t *v, u64 a, u64 u) +long long _atomic64_xchg_add_unless(long long *v, long long a, long long u)  {  	/*  	 * Note: argument order is switched here since it is easier  	 * to use the first argument consistently as the "old value"  	 * in the assembly, as is done for _atomic_cmpxchg().  	 
*/ -	return __atomic64_xchg_add_unless(&v->counter, __atomic_setup(v), -					  u, a); +	return __atomic64_xchg_add_unless(v, __atomic_setup(v), u, a);  }  EXPORT_SYMBOL(_atomic64_xchg_add_unless); -u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n) +long long _atomic64_cmpxchg(long long *v, long long o, long long n)  { -	return __atomic64_cmpxchg(&v->counter, __atomic_setup(v), o, n); +	return __atomic64_cmpxchg(v, __atomic_setup(v), o, n);  }  EXPORT_SYMBOL(_atomic64_cmpxchg); -static inline int *__futex_setup(int __user *v) -{ -	/* -	 * Issue a prefetch to the counter to bring it into cache. -	 * As for __atomic_setup, but we can't do a read into the L1 -	 * since it might fault; instead we do a prefetch into the L2. -	 */ -	__insn_prefetch(v); -	return __atomic_hashed_lock((int __force *)v); -} - -struct __get_user futex_set(int __user *v, int i) -{ -	return __atomic_xchg((int __force *)v, __futex_setup(v), i); -} - -struct __get_user futex_add(int __user *v, int n) -{ -	return __atomic_xchg_add((int __force *)v, __futex_setup(v), n); -} - -struct __get_user futex_or(int __user *v, int n) -{ -	return __atomic_or((int __force *)v, __futex_setup(v), n); -} - -struct __get_user futex_andn(int __user *v, int n) -{ -	return __atomic_andn((int __force *)v, __futex_setup(v), n); -} - -struct __get_user futex_xor(int __user *v, int n) -{ -	return __atomic_xor((int __force *)v, __futex_setup(v), n); -} - -struct __get_user futex_cmpxchg(int __user *v, int o, int n) -{ -	return __atomic_cmpxchg((int __force *)v, __futex_setup(v), o, n); -} -  /*   * If any of the atomic or futex routines hit a bad address (not in   * the page tables at kernel PL) this routine is called.  The futex @@ -251,54 +155,8 @@ struct __get_user __atomic_bad_address(int __user *addr)  } -#if CHIP_HAS_CBOX_HOME_MAP() -static int __init noatomichash(char *str) -{ -	pr_warning("noatomichash is deprecated.\n"); -	return 1; -} -__setup("noatomichash", noatomichash); -#endif -  void __init __init_atomic_per_cpu(void)  { -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - -	unsigned int i; -	int actual_cpu; - -	/* -	 * Before this is called from setup, we just have one lock for -	 * all atomic objects/operations.  Here we replace the -	 * elements of atomic_lock_ptr so that they point at per_cpu -	 * integers.  This seemingly over-complex approach stems from -	 * the fact that DEFINE_PER_CPU defines an entry for each cpu -	 * in the grid, not each cpu from 0..ATOMIC_HASH_SIZE-1.  But -	 * for efficient hashing of atomics to their locks we want a -	 * compile time constant power of 2 for the size of this -	 * table, so we use ATOMIC_HASH_SIZE. -	 * -	 * Here we populate atomic_lock_ptr from the per cpu -	 * atomic_lock_pool, interspersing by actual cpu so that -	 * subsequent elements are homed on consecutive cpus. -	 */ - -	actual_cpu = cpumask_first(cpu_possible_mask); - -	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) { -		/* -		 * Preincrement to slightly bias against using cpu 0, -		 * which has plenty of stuff homed on it already. 
-		 */ -		actual_cpu = cpumask_next(actual_cpu, cpu_possible_mask); -		if (actual_cpu >= nr_cpu_ids) -			actual_cpu = cpumask_first(cpu_possible_mask); - -		atomic_lock_ptr[i] = &per_cpu(atomic_lock_pool, actual_cpu); -	} - -#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ -  	/* Validate power-of-two and "bigger than cpus" assumption */  	BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));  	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids); @@ -322,9 +180,4 @@ void __init __init_atomic_per_cpu(void)  	 * That should not produce more indices than ATOMIC_HASH_SIZE.  	 */  	BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE); - -#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ - -	/* The futex code makes this assumption, so we validate it here. */ -	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));  } diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S index 5a5514b77e7..6bda3132cd6 100644 --- a/arch/tile/lib/atomic_asm_32.S +++ b/arch/tile/lib/atomic_asm_32.S @@ -14,7 +14,7 @@   * Support routines for atomic operations.  Each function takes:   *   * r0: address to manipulate - * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG) + * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG)   * r2: new value to write, or for cmpxchg/add_unless, value to compare against   * r3: (cmpxchg/xchg_add_unless) new value to write or add;   *     (atomic64 ops) high word of value to write @@ -59,7 +59,7 @@   * bad kernel addresses).   *   * Note that if the value we would store is the same as what we - * loaded, we bypass the load.  Other platforms with true atomics can + * loaded, we bypass the store.  Other platforms with true atomics can   * make the guarantee that a non-atomic __clear_bit(), for example,   * can safely race with an atomic test_and_set_bit(); this example is   * from bit_spinlock.h in slub_lock() / slub_unlock().  We can't do @@ -70,7 +70,7 @@   */  #include <linux/linkage.h> -#include <asm/atomic.h> +#include <asm/atomic_32.h>  #include <asm/page.h>  #include <asm/processor.h> @@ -164,6 +164,7 @@ STD_ENTRY_SECTION(__atomic\name, .text.atomic)  	STD_ENDPROC(__atomic\name)  	.ifc \bitwidth,32  	.pushsection __ex_table,"a" +	.align  4  	.word   1b, __atomic\name  	.word   2b, __atomic\name  	.word   __atomic\name, __atomic_bad_address diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c index 11b6164c209..9c0ec22009a 100644 --- a/arch/tile/lib/cacheflush.c +++ b/arch/tile/lib/cacheflush.c @@ -12,12 +12,162 @@   *   more details.   */ +#include <linux/export.h>  #include <asm/page.h>  #include <asm/cacheflush.h>  #include <arch/icache.h> +#include <arch/spr_def.h>  void __flush_icache_range(unsigned long start, unsigned long end)  {  	invalidate_icache((const void *)start, end - start, PAGE_SIZE);  } + + +/* Force a load instruction to issue. */ +static inline void force_load(char *p) +{ +	*(volatile char *)p; +} + +/* + * Flush and invalidate a VA range that is homed remotely on a single + * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting + * until the memory controller holds the flushed values. 
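A hedged usage sketch, not from the patch, of how a caller hands off such a buffer (the driver hook named here is purely illustrative; finv_buffer_remote() itself is the routine defined and exported below):

/* Sketch: push a remotely-homed (or hash-for-home) buffer all the way to
 * memory before a non-coherent consumer touches it. */
static void hand_buffer_to_device(void *buf, size_t len, int hfh)
{
	finv_buffer_remote(buf, len, hfh);	/* flushed values now in RAM */
	my_dev_start_dma(buf, len);		/* hypothetical driver hook */
}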
+ */ +void __attribute__((optimize("omit-frame-pointer"))) +finv_buffer_remote(void *buffer, size_t size, int hfh) +{ +	char *p, *base; +	size_t step_size, load_count; + +	/* +	 * On TILEPro the striping granularity is a fixed 8KB; on +	 * TILE-Gx it is configurable, and we rely on the fact that +	 * the hypervisor always configures maximum striping, so that +	 * bits 9 and 10 of the PA are part of the stripe function, so +	 * every 512 bytes we hit a striping boundary. +	 * +	 */ +#ifdef __tilegx__ +	const unsigned long STRIPE_WIDTH = 512; +#else +	const unsigned long STRIPE_WIDTH = 8192; +#endif + +#ifdef __tilegx__ +	/* +	 * On TILE-Gx, we must disable the dstream prefetcher before doing +	 * a cache flush; otherwise, we could end up with data in the cache +	 * that we don't want there.  Note that normally we'd do an mf +	 * after the SPR write to disabling the prefetcher, but we do one +	 * below, before any further loads, so there's no need to do it +	 * here. +	 */ +	uint_reg_t old_dstream_pf = __insn_mfspr(SPR_DSTREAM_PF); +	__insn_mtspr(SPR_DSTREAM_PF, 0); +#endif + +	/* +	 * Flush and invalidate the buffer out of the local L1/L2 +	 * and request the home cache to flush and invalidate as well. +	 */ +	__finv_buffer(buffer, size); + +	/* +	 * Wait for the home cache to acknowledge that it has processed +	 * all the flush-and-invalidate requests.  This does not mean +	 * that the flushed data has reached the memory controller yet, +	 * but it does mean the home cache is processing the flushes. +	 */ +	__insn_mf(); + +	/* +	 * Issue a load to the last cache line, which can't complete +	 * until all the previously-issued flushes to the same memory +	 * controller have also completed.  If we weren't striping +	 * memory, that one load would be sufficient, but since we may +	 * be, we also need to back up to the last load issued to +	 * another memory controller, which would be the point where +	 * we crossed a "striping" boundary (the granularity of striping +	 * across memory controllers).  Keep backing up and doing this +	 * until we are before the beginning of the buffer, or have +	 * hit all the controllers. +	 * +	 * If we are flushing a hash-for-home buffer, it's even worse. +	 * Each line may be homed on a different tile, and each tile +	 * may have up to four lines that are on different +	 * controllers.  So as we walk backwards, we have to touch +	 * enough cache lines to satisfy these constraints.  In +	 * practice this ends up being close enough to "load from +	 * every cache line on a full memory stripe on each +	 * controller" that we simply do that, to simplify the logic. +	 * +	 * On TILE-Gx the hash-for-home function is much more complex, +	 * with the upshot being we can't readily guarantee we have +	 * hit both entries in the 128-entry AMT that were hit by any +	 * load in the entire range, so we just re-load them all. +	 * With larger buffers, we may want to consider using a hypervisor +	 * trap to issue loads directly to each hash-for-home tile for +	 * each controller (doing it from Linux would trash the TLB). +	 */ +	if (hfh) { +		step_size = L2_CACHE_BYTES; +#ifdef __tilegx__ +		load_count = (size + L2_CACHE_BYTES - 1) / L2_CACHE_BYTES; +#else +		load_count = (STRIPE_WIDTH / L2_CACHE_BYTES) * +			      (1 << CHIP_LOG_NUM_MSHIMS()); +#endif +	} else { +		step_size = STRIPE_WIDTH; +		load_count = (1 << CHIP_LOG_NUM_MSHIMS()); +	} + +	/* Load the last byte of the buffer. 
*/ +	p = (char *)buffer + size - 1; +	force_load(p); + +	/* Bump down to the end of the previous stripe or cache line. */ +	p -= step_size; +	p = (char *)((unsigned long)p | (step_size - 1)); + +	/* Figure out how far back we need to go. */ +	base = p - (step_size * (load_count - 2)); +	if ((unsigned long)base < (unsigned long)buffer) +		base = buffer; + +	/* +	 * Fire all the loads we need.  The MAF only has eight entries +	 * so we can have at most eight outstanding loads, so we +	 * unroll by that amount. +	 */ +#pragma unroll 8 +	for (; p >= base; p -= step_size) +		force_load(p); + +	/* +	 * Repeat, but with finv's instead of loads, to get rid of the +	 * data we just loaded into our own cache and the old home L3. +	 * No need to unroll since finv's don't target a register. +	 * The finv's are guaranteed not to actually flush the data in +	 * the buffer back to their home, since we just read it, so the +	 * lines are clean in cache; we will only invalidate those lines. +	 */ +	p = (char *)buffer + size - 1; +	__insn_finv(p); +	p -= step_size; +	p = (char *)((unsigned long)p | (step_size - 1)); +	for (; p >= base; p -= step_size) +		__insn_finv(p); + +	/* Wait for these finv's (and thus the first finvs) to be done. */ +	__insn_mf(); + +#ifdef __tilegx__ +	/* Reenable the prefetcher. */ +	__insn_mtspr(SPR_DSTREAM_PF, old_dstream_pf); +#endif +} +EXPORT_SYMBOL_GPL(finv_buffer_remote); diff --git a/arch/tile/lib/checksum.c b/arch/tile/lib/checksum.c index e4bab5bd3f3..c3ca3e64d9d 100644 --- a/arch/tile/lib/checksum.c +++ b/arch/tile/lib/checksum.c @@ -16,19 +16,6 @@  #include <net/checksum.h>  #include <linux/module.h> -static inline unsigned int longto16(unsigned long x) -{ -	unsigned long ret; -#ifdef __tilegx__ -	ret = __insn_v2sadu(x, 0); -	ret = __insn_v2sadu(ret, 0); -#else -	ret = __insn_sadh_u(x, 0); -	ret = __insn_sadh_u(ret, 0); -#endif -	return ret; -} -  __wsum do_csum(const unsigned char *buff, int len)  {  	int odd, count; @@ -94,7 +81,7 @@ __wsum do_csum(const unsigned char *buff, int len)  	}  	if (len & 1)  		result += *buff; -	result = longto16(result); +	result = csum_long(result);  	if (odd)  		result = swab16(result);  out: diff --git a/arch/tile/lib/cpumask.c b/arch/tile/lib/cpumask.c index fdc403614d1..75947edccb2 100644 --- a/arch/tile/lib/cpumask.c +++ b/arch/tile/lib/cpumask.c @@ -16,6 +16,7 @@  #include <linux/ctype.h>  #include <linux/errno.h>  #include <linux/smp.h> +#include <linux/export.h>  /*   * Allow cropping out bits beyond the end of the array. @@ -50,3 +51,4 @@ int bitmap_parselist_crop(const char *bp, unsigned long *maskp, int nmaskbits)  	} while (*bp != '\0' && *bp != '\n');  	return 0;  } +EXPORT_SYMBOL(bitmap_parselist_crop); diff --git a/arch/tile/lib/delay.c b/arch/tile/lib/delay.c index 5801b03c13e..cdacdd11d36 100644 --- a/arch/tile/lib/delay.c +++ b/arch/tile/lib/delay.c @@ -15,20 +15,31 @@  #include <linux/module.h>  #include <linux/delay.h>  #include <linux/thread_info.h> -#include <asm/fixmap.h> -#include <hv/hypervisor.h> +#include <asm/timex.h>  void __udelay(unsigned long usecs)  { -	hv_nanosleep(usecs * 1000); +	if (usecs > ULONG_MAX / 1000) { +		WARN_ON_ONCE(usecs > ULONG_MAX / 1000); +		usecs = ULONG_MAX / 1000; +	} +	__ndelay(usecs * 1000);  }  EXPORT_SYMBOL(__udelay);  void __ndelay(unsigned long nsecs)  { -	hv_nanosleep(nsecs); +	cycles_t target = get_cycles(); +	target += ns2cycles(nsecs); +	while (get_cycles() < target) +		cpu_relax();  }  EXPORT_SYMBOL(__ndelay); -/* FIXME: should be declared in a header somewhere. 
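The new delay.c above drops the hv_nanosleep() call in favour of busy-waiting on the cycle counter; a hedged sketch, not from the patch, of the pattern __ndelay() and the __delay() that follows share (helper name illustrative):

/* Sketch: spin until the free-running cycle counter passes a target. */
static void spin_for_cycles(cycles_t delta)
{
	cycles_t target = get_cycles() + delta;

	while (get_cycles() < target)
		cpu_relax();
}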
*/ +void __delay(unsigned long cycles) +{ +	cycles_t target = get_cycles() + cycles; +	while (get_cycles() < target) +		cpu_relax(); +}  EXPORT_SYMBOL(__delay); diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c index 1509c559765..82733c87d67 100644 --- a/arch/tile/lib/exports.c +++ b/arch/tile/lib/exports.c @@ -18,17 +18,11 @@  /* arch/tile/lib/usercopy.S */  #include <linux/uaccess.h> -EXPORT_SYMBOL(__get_user_1); -EXPORT_SYMBOL(__get_user_2); -EXPORT_SYMBOL(__get_user_4); -EXPORT_SYMBOL(__get_user_8); -EXPORT_SYMBOL(__put_user_1); -EXPORT_SYMBOL(__put_user_2); -EXPORT_SYMBOL(__put_user_4); -EXPORT_SYMBOL(__put_user_8);  EXPORT_SYMBOL(strnlen_user_asm);  EXPORT_SYMBOL(strncpy_from_user_asm);  EXPORT_SYMBOL(clear_user_asm); +EXPORT_SYMBOL(flush_user_asm); +EXPORT_SYMBOL(finv_user_asm);  /* arch/tile/kernel/entry.S */  #include <linux/kernel.h> @@ -36,6 +30,15 @@ EXPORT_SYMBOL(clear_user_asm);  EXPORT_SYMBOL(current_text_addr);  EXPORT_SYMBOL(dump_stack); +/* arch/tile/kernel/head.S */ +EXPORT_SYMBOL(empty_zero_page); + +#ifdef CONFIG_FUNCTION_TRACER +/* arch/tile/kernel/mcount_64.S */ +#include <asm/ftrace.h> +EXPORT_SYMBOL(__mcount); +#endif /* CONFIG_FUNCTION_TRACER */ +  /* arch/tile/lib/, various memcpy files */  EXPORT_SYMBOL(memcpy);  EXPORT_SYMBOL(__copy_to_user_inatomic); @@ -45,9 +48,6 @@ EXPORT_SYMBOL(__copy_from_user_zeroing);  EXPORT_SYMBOL(__copy_in_user_inatomic);  #endif -/* arch/tile/lib/mb_incoherent.S */ -EXPORT_SYMBOL(__mb_incoherent); -  /* hypervisor glue */  #include <hv/hypervisor.h>  EXPORT_SYMBOL(hv_dev_open); @@ -60,6 +60,8 @@ EXPORT_SYMBOL(hv_dev_poll_cancel);  EXPORT_SYMBOL(hv_dev_close);  EXPORT_SYMBOL(hv_sysconf);  EXPORT_SYMBOL(hv_confstr); +EXPORT_SYMBOL(hv_get_rtc); +EXPORT_SYMBOL(hv_set_rtc);  /* libgcc.a */  uint32_t __udivsi3(uint32_t dividend, uint32_t divisor); @@ -79,10 +81,14 @@ EXPORT_SYMBOL(__umoddi3);  int64_t __moddi3(int64_t dividend, int64_t divisor);  EXPORT_SYMBOL(__moddi3);  #ifndef __tilegx__ -uint64_t __ll_mul(uint64_t n0, uint64_t n1); -EXPORT_SYMBOL(__ll_mul);  int64_t __muldi3(int64_t, int64_t);  EXPORT_SYMBOL(__muldi3);  uint64_t __lshrdi3(uint64_t, unsigned int);  EXPORT_SYMBOL(__lshrdi3); +uint64_t __ashrdi3(uint64_t, unsigned int); +EXPORT_SYMBOL(__ashrdi3); +uint64_t __ashldi3(uint64_t, unsigned int); +EXPORT_SYMBOL(__ashldi3); +int __ffsdi2(uint64_t); +EXPORT_SYMBOL(__ffsdi2);  #endif diff --git a/arch/tile/lib/mb_incoherent.S b/arch/tile/lib/mb_incoherent.S deleted file mode 100644 index 989ad7b68d5..00000000000 --- a/arch/tile/lib/mb_incoherent.S +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - *   This program is free software; you can redistribute it and/or - *   modify it under the terms of the GNU General Public License - *   as published by the Free Software Foundation, version 2. - * - *   This program is distributed in the hope that it will be useful, but - *   WITHOUT ANY WARRANTY; without even the implied warranty of - *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - *   NON INFRINGEMENT.  See the GNU General Public License for - *   more details. - * - * Assembly code for invoking the HV's fence_incoherent syscall. - */ - -#include <linux/linkage.h> -#include <hv/syscall_public.h> -#include <arch/abi.h> -#include <arch/chip.h> - -#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS() - -/* - * Invoke the hypervisor's fence_incoherent syscall, which guarantees - * that all victims for cachelines homed on this tile have reached memory. 
- */ -STD_ENTRY(__mb_incoherent) -	moveli TREG_SYSCALL_NR_NAME, HV_SYS_fence_incoherent -	swint2 -	jrp lr -	STD_ENDPROC(__mb_incoherent) - -#endif diff --git a/arch/tile/lib/memchr_32.c b/arch/tile/lib/memchr_32.c index 6235283b485..cc3d9badf03 100644 --- a/arch/tile/lib/memchr_32.c +++ b/arch/tile/lib/memchr_32.c @@ -18,12 +18,24 @@  void *memchr(const void *s, int c, size_t n)  { +	const uint32_t *last_word_ptr; +	const uint32_t *p; +	const char *last_byte_ptr; +	uintptr_t s_int; +	uint32_t goal, before_mask, v, bits; +	char *ret; + +	if (__builtin_expect(n == 0, 0)) { +		/* Don't dereference any memory if the array is empty. */ +		return NULL; +	} +  	/* Get an aligned pointer. */ -	const uintptr_t s_int = (uintptr_t) s; -	const uint32_t *p = (const uint32_t *)(s_int & -4); +	s_int = (uintptr_t) s; +	p = (const uint32_t *)(s_int & -4);  	/* Create four copies of the byte for which we are looking. */ -	const uint32_t goal = 0x01010101 * (uint8_t) c; +	goal = 0x01010101 * (uint8_t) c;  	/* Read the first word, but munge it so that bytes before the array  	 * will not match goal. @@ -31,23 +43,14 @@ void *memchr(const void *s, int c, size_t n)  	 * Note that this shift count expression works because we know  	 * shift counts are taken mod 32.  	 */ -	const uint32_t before_mask = (1 << (s_int << 3)) - 1; -	uint32_t v = (*p | before_mask) ^ (goal & before_mask); +	before_mask = (1 << (s_int << 3)) - 1; +	v = (*p | before_mask) ^ (goal & before_mask);  	/* Compute the address of the last byte. */ -	const char *const last_byte_ptr = (const char *)s + n - 1; +	last_byte_ptr = (const char *)s + n - 1;  	/* Compute the address of the word containing the last byte. */ -	const uint32_t *const last_word_ptr = -	    (const uint32_t *)((uintptr_t) last_byte_ptr & -4); - -	uint32_t bits; -	char *ret; - -	if (__builtin_expect(n == 0, 0)) { -		/* Don't dereference any memory if the array is empty. */ -		return NULL; -	} +	last_word_ptr = (const uint32_t *)((uintptr_t) last_byte_ptr & -4);  	while ((bits = __insn_seqb(v, goal)) == 0) {  		if (__builtin_expect(p == last_word_ptr, 0)) { diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c new file mode 100644 index 00000000000..f8196b3a950 --- /dev/null +++ b/arch/tile/lib/memchr_64.c @@ -0,0 +1,69 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +#include "string-endian.h" + +void *memchr(const void *s, int c, size_t n) +{ +	const uint64_t *last_word_ptr; +	const uint64_t *p; +	const char *last_byte_ptr; +	uintptr_t s_int; +	uint64_t goal, before_mask, v, bits; +	char *ret; + +	if (__builtin_expect(n == 0, 0)) { +		/* Don't dereference any memory if the array is empty. */ +		return NULL; +	} + +	/* Get an aligned pointer. */ +	s_int = (uintptr_t) s; +	p = (const uint64_t *)(s_int & -8); + +	/* Create eight copies of the byte for which we are looking. 
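copy_byte() below comes from the new string-endian.h helper header, which is not part of this hunk; a hedged guess at its shape, mirroring the 32-bit version's 0x01010101 multiply shown earlier:

/* Sketch: eight-way byte replication as string-endian.h presumably does it. */
static inline uint64_t copy_byte_sketch(int c)
{
	return 0x0101010101010101ULL * (uint8_t)c;
}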
*/ +	goal = copy_byte(c); + +	/* Read the first word, but munge it so that bytes before the array +	 * will not match goal. +	 */ +	before_mask = MASK(s_int); +	v = (*p | before_mask) ^ (goal & before_mask); + +	/* Compute the address of the last byte. */ +	last_byte_ptr = (const char *)s + n - 1; + +	/* Compute the address of the word containing the last byte. */ +	last_word_ptr = (const uint64_t *)((uintptr_t) last_byte_ptr & -8); + +	while ((bits = __insn_v1cmpeq(v, goal)) == 0) { +		if (__builtin_expect(p == last_word_ptr, 0)) { +			/* We already read the last word in the array, +			 * so give up. +			 */ +			return NULL; +		} +		v = *++p; +	} + +	/* We found a match, but it might be in a byte past the end +	 * of the array. +	 */ +	ret = ((char *)p) + (CFZ(bits) >> 3); +	return (ret <= last_byte_ptr) ? ret : NULL; +} +EXPORT_SYMBOL(memchr); diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S index 2a419a6122d..a2771ae5da5 100644 --- a/arch/tile/lib/memcpy_32.S +++ b/arch/tile/lib/memcpy_32.S @@ -22,14 +22,6 @@  #include <linux/linkage.h> -/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */ -#if !CHIP_HAS_COHERENT_LOCAL_CACHE() -#define memcpy __memcpy_asm -#define __copy_to_user_inatomic __copy_to_user_inatomic_asm -#define __copy_from_user_inatomic __copy_from_user_inatomic_asm -#define __copy_from_user_zeroing __copy_from_user_zeroing_asm -#endif -  #define IS_MEMCPY	  0  #define IS_COPY_FROM_USER  1  #define IS_COPY_FROM_USER_ZEROING  2 @@ -44,6 +36,7 @@   */  #define EX \  	.pushsection __ex_table, "a"; \ +	.align 4; \  	.word 9f, memcpy_common_fixup; \  	.popsection; \  	9 @@ -158,12 +151,9 @@ EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }  	{ addi r3, r1, 60; andi r9, r9, -64 } -#if CHIP_HAS_WH64()  	/* No need to prefetch dst, we'll just do the wh64  	 * right before we copy a line.  	 */ -#endif -  EX:	{ lw r5, r3; addi r3, r3, 64; movei r4, 1 }  	/* Intentionally stall for a few cycles to leave L2 cache alone. */  	{ bnzt zero, .; move r27, lr } @@ -171,21 +161,6 @@ EX:	{ lw r6, r3; addi r3, r3, 64 }  	/* Intentionally stall for a few cycles to leave L2 cache alone. */  	{ bnzt zero, . }  EX:	{ lw r7, r3; addi r3, r3, 64 } -#if !CHIP_HAS_WH64() -	/* Prefetch the dest */ -	/* Intentionally stall for a few cycles to leave L2 cache alone. */ -	{ bnzt zero, . } -	/* Use a real load to cause a TLB miss if necessary.  We aren't using -	 * r28, so this should be fine. -	 */ -EX:	{ lw r28, r9; addi r9, r9, 64 } -	/* Intentionally stall for a few cycles to leave L2 cache alone. */ -	{ bnzt zero, . } -	{ prefetch r9; addi r9, r9, 64 } -	/* Intentionally stall for a few cycles to leave L2 cache alone. */ -	{ bnzt zero, . } -	{ prefetch r9; addi r9, r9, 64 } -#endif  	/* Intentionally stall for a few cycles to leave L2 cache alone. */  	{ bz zero, .Lbig_loop2 } @@ -286,13 +261,8 @@ EX:	{ lw r7, r3; addi r3, r3, 64 }  	/* Fill second L1D line. */  EX:	{ lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */ -#if CHIP_HAS_WH64()  	/* Prepare destination line for writing. */  EX:	{ wh64 r9; addi r9, r9, 64 } -#else -	/* Prefetch dest line */ -	{ prefetch r9; addi r9, r9, 64 } -#endif  	/* Load seven words that are L1D hits to cover wh64 L2 usage. 
*/  	/* Load the three remaining words from the last L1D line, which @@ -330,16 +300,7 @@ EX:	{ lw r18, r1; addi r1, r1, 4 }                  /* r18 = WORD_8 */  EX:	{ sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */  EX:	{ sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */  EX:	{ sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */ -#if CHIP_HAS_WH64()  EX:	{ sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */ -#else -	/* Back up the r9 to a cache line we are already storing to -	 * if it gets past the end of the dest vector.  Strictly speaking, -	 * we don't need to back up to the start of a cache line, but it's free -	 * and tidy, so why not? -	 */ -EX:	{ sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */ -#endif  	/* Store second L1D line. */  EX:	{ sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */  EX:	{ sw r0, r19; addi r0, r0, 4 }                  /* store(WORD_5) */ @@ -403,7 +364,6 @@ EX:	{ sb r0, r3;   addi r0, r0, 1; addi r2, r2, -1 }  .Ldest_is_word_aligned: -#if CHIP_HAS_DWORD_ALIGN()  EX:	{ andi r8, r0, 63; lwadd_na r6, r1, 4}  	{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned } @@ -511,26 +471,6 @@ EX:	{ swadd r0, r13, 4; addi r2, r2, -32 }  	/* Move r1 back to the point where it corresponds to r0. */  	{ addi r1, r1, -4 } -#else /* !CHIP_HAS_DWORD_ALIGN() */ - -	/* Compute right/left shift counts and load initial source words. */ -	{ andi r5, r1, -4; andi r3, r1, 3 } -EX:	{ lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 } -EX:	{ lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 } - -	/* Load and store one word at a time, using shifts and ORs -	 * to correct for the misaligned src. -	 */ -.Lcopy_unaligned_src_loop: -	{ shr r6, r6, r3; shl r8, r7, r4 } -EX:	{ lw r7, r5; or r8, r8, r6; move r6, r7 } -EX:	{ sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 } -	{ addi r5, r5, 4; slti_u r8, r2, 8 } -	{ bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 } - -	{ bz r2, .Lcopy_unaligned_done } -#endif /* !CHIP_HAS_DWORD_ALIGN() */ -  	/* Fall through */  /* @@ -614,5 +554,6 @@ memcpy_fixup_loop:  	.size memcpy_common_fixup, . - memcpy_common_fixup  	.section __ex_table,"a" +	.align 4  	.word .Lcfu, .Lcopy_from_user_fixup_zero_remainder  	.word .Lctu, .Lcopy_to_user_fixup_done diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c new file mode 100644 index 00000000000..4815354b8cd --- /dev/null +++ b/arch/tile/lib/memcpy_64.c @@ -0,0 +1,367 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +/* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */ + +/* Must be 8 bytes in size. */ +#define op_t uint64_t + +/* Threshold value for when to enter the unrolled loops. */ +#define	OP_T_THRES	16 + +#if CHIP_L2_LINE_SIZE() != 64 +#error "Assumes 64 byte line size" +#endif + +/* How many cache lines ahead should we prefetch? 
*/ +#define PREFETCH_LINES_AHEAD 4 + +/* + * Provide "base versions" of load and store for the normal code path. + * The kernel provides other versions for userspace copies. + */ +#define ST(p, v) (*(p) = (v)) +#define LD(p) (*(p)) + +#ifndef USERCOPY_FUNC +#define ST1 ST +#define ST2 ST +#define ST4 ST +#define ST8 ST +#define LD1 LD +#define LD2 LD +#define LD4 LD +#define LD8 LD +#define RETVAL dstv +void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n) +#else +/* + * Special kernel version will provide implementation of the LDn/STn + * macros to return a count of uncopied bytes due to mm fault. + */ +#define RETVAL 0 +int __attribute__((optimize("omit-frame-pointer"))) +USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n) +#endif +{ +	char *__restrict dst1 = (char *)dstv; +	const char *__restrict src1 = (const char *)srcv; +	const char *__restrict src1_end; +	const char *__restrict prefetch; +	op_t *__restrict dst8;    /* 8-byte pointer to destination memory. */ +	op_t final; /* Final bytes to write to trailing word, if any */ +	long i; + +	if (n < 16) { +		for (; n; n--) +			ST1(dst1++, LD1(src1++)); +		return RETVAL; +	} + +	/* +	 * Locate the end of source memory we will copy.  Don't +	 * prefetch past this. +	 */ +	src1_end = src1 + n - 1; + +	/* Prefetch ahead a few cache lines, but not past the end. */ +	prefetch = src1; +	for (i = 0; i < PREFETCH_LINES_AHEAD; i++) { +		__insn_prefetch(prefetch); +		prefetch += CHIP_L2_LINE_SIZE(); +		prefetch = (prefetch < src1_end) ? prefetch : src1; +	} + +	/* Copy bytes until dst is word-aligned. */ +	for (; (uintptr_t)dst1 & (sizeof(op_t) - 1); n--) +		ST1(dst1++, LD1(src1++)); + +	/* 8-byte pointer to destination memory. */ +	dst8 = (op_t *)dst1; + +	if (__builtin_expect((uintptr_t)src1 & (sizeof(op_t) - 1), 0)) { +		/* Unaligned copy. */ + +		op_t  tmp0 = 0, tmp1 = 0, tmp2, tmp3; +		const op_t *src8 = (const op_t *) ((uintptr_t)src1 & +						   -sizeof(op_t)); +		const void *srci = (void *)src1; +		int m; + +		m = (CHIP_L2_LINE_SIZE() << 2) - +			(((uintptr_t)dst8) & ((CHIP_L2_LINE_SIZE() << 2) - 1)); +		m = (n < m) ? n : m; +		m /= sizeof(op_t); + +		/* Copy until 'dst' is cache-line-aligned. 
*/ +		n -= (sizeof(op_t) * m); + +		switch (m % 4) { +		case 0: +			if (__builtin_expect(!m, 0)) +				goto _M0; +			tmp1 = LD8(src8++); +			tmp2 = LD8(src8++); +			goto _8B3; +		case 2: +			m += 2; +			tmp3 = LD8(src8++); +			tmp0 = LD8(src8++); +			goto _8B1; +		case 3: +			m += 1; +			tmp2 = LD8(src8++); +			tmp3 = LD8(src8++); +			goto _8B2; +		case 1: +			m--; +			tmp0 = LD8(src8++); +			tmp1 = LD8(src8++); +			if (__builtin_expect(!m, 0)) +				goto _8B0; +		} + +		do { +			tmp2 = LD8(src8++); +			tmp0 =  __insn_dblalign(tmp0, tmp1, srci); +			ST8(dst8++, tmp0); +_8B3: +			tmp3 = LD8(src8++); +			tmp1 = __insn_dblalign(tmp1, tmp2, srci); +			ST8(dst8++, tmp1); +_8B2: +			tmp0 = LD8(src8++); +			tmp2 = __insn_dblalign(tmp2, tmp3, srci); +			ST8(dst8++, tmp2); +_8B1: +			tmp1 = LD8(src8++); +			tmp3 = __insn_dblalign(tmp3, tmp0, srci); +			ST8(dst8++, tmp3); +			m -= 4; +		} while (m); + +_8B0: +		tmp0 = __insn_dblalign(tmp0, tmp1, srci); +		ST8(dst8++, tmp0); +		src8--; + +_M0: +		if (__builtin_expect(n >= CHIP_L2_LINE_SIZE(), 0)) { +			op_t tmp4, tmp5, tmp6, tmp7, tmp8; + +			prefetch = ((const char *)src8) + +				CHIP_L2_LINE_SIZE() * PREFETCH_LINES_AHEAD; + +			for (tmp0 = LD8(src8++); n >= CHIP_L2_LINE_SIZE(); +			     n -= CHIP_L2_LINE_SIZE()) { +				/* Prefetch and advance to next line to +				   prefetch, but don't go past the end.  */ +				__insn_prefetch(prefetch); + +				/* Make sure prefetch got scheduled +				   earlier.  */ +				__asm__ ("" : : : "memory"); + +				prefetch += CHIP_L2_LINE_SIZE(); +				prefetch = (prefetch < src1_end) ? prefetch : +					(const char *) src8; + +				tmp1 = LD8(src8++); +				tmp2 = LD8(src8++); +				tmp3 = LD8(src8++); +				tmp4 = LD8(src8++); +				tmp5 = LD8(src8++); +				tmp6 = LD8(src8++); +				tmp7 = LD8(src8++); +				tmp8 = LD8(src8++); + +				tmp0 = __insn_dblalign(tmp0, tmp1, srci); +				tmp1 = __insn_dblalign(tmp1, tmp2, srci); +				tmp2 = __insn_dblalign(tmp2, tmp3, srci); +				tmp3 = __insn_dblalign(tmp3, tmp4, srci); +				tmp4 = __insn_dblalign(tmp4, tmp5, srci); +				tmp5 = __insn_dblalign(tmp5, tmp6, srci); +				tmp6 = __insn_dblalign(tmp6, tmp7, srci); +				tmp7 = __insn_dblalign(tmp7, tmp8, srci); + +				__insn_wh64(dst8); + +				ST8(dst8++, tmp0); +				ST8(dst8++, tmp1); +				ST8(dst8++, tmp2); +				ST8(dst8++, tmp3); +				ST8(dst8++, tmp4); +				ST8(dst8++, tmp5); +				ST8(dst8++, tmp6); +				ST8(dst8++, tmp7); + +				tmp0 = tmp8; +			} +			src8--; +		} + +		/* Copy the rest 8-byte chunks. */ +		if (n >= sizeof(op_t)) { +			tmp0 = LD8(src8++); +			for (; n >= sizeof(op_t); n -= sizeof(op_t)) { +				tmp1 = LD8(src8++); +				tmp0 = __insn_dblalign(tmp0, tmp1, srci); +				ST8(dst8++, tmp0); +				tmp0 = tmp1; +			} +			src8--; +		} + +		if (n == 0) +			return RETVAL; + +		tmp0 = LD8(src8++); +		tmp1 = ((const char *)src8 <= src1_end) +			? LD8((op_t *)src8) : 0; +		final = __insn_dblalign(tmp0, tmp1, srci); + +	} else { +		/* Aligned copy. */ + +		const op_t *__restrict src8 = (const op_t *)src1; + +		/* src8 and dst8 are both word-aligned. */ +		if (n >= CHIP_L2_LINE_SIZE()) { +			/* Copy until 'dst' is cache-line-aligned. */ +			for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1); +			     n -= sizeof(op_t)) +				ST8(dst8++, LD8(src8++)); + +			for (; n >= CHIP_L2_LINE_SIZE(); ) { +				op_t tmp0, tmp1, tmp2, tmp3; +				op_t tmp4, tmp5, tmp6, tmp7; + +				/* +				 * Prefetch and advance to next line +				 * to prefetch, but don't go past the +				 * end. 
+				 */ +				__insn_prefetch(prefetch); + +				/* Make sure prefetch got scheduled +				   earlier.  */ +				__asm__ ("" : : : "memory"); + +				prefetch += CHIP_L2_LINE_SIZE(); +				prefetch = (prefetch < src1_end) ? prefetch : +					(const char *)src8; + +				/* +				 * Do all the loads before wh64.  This +				 * is necessary if [src8, src8+7] and +				 * [dst8, dst8+7] share the same cache +				 * line and dst8 <= src8, as can be +				 * the case when called from memmove, +				 * or with code tested on x86 whose +				 * memcpy always works with forward +				 * copies. +				 */ +				tmp0 = LD8(src8++); +				tmp1 = LD8(src8++); +				tmp2 = LD8(src8++); +				tmp3 = LD8(src8++); +				tmp4 = LD8(src8++); +				tmp5 = LD8(src8++); +				tmp6 = LD8(src8++); +				tmp7 = LD8(src8++); + +				/* wh64 and wait for tmp7 load completion. */ +				__asm__ ("move %0, %0; wh64 %1\n" +					 : : "r"(tmp7), "r"(dst8)); + +				ST8(dst8++, tmp0); +				ST8(dst8++, tmp1); +				ST8(dst8++, tmp2); +				ST8(dst8++, tmp3); +				ST8(dst8++, tmp4); +				ST8(dst8++, tmp5); +				ST8(dst8++, tmp6); +				ST8(dst8++, tmp7); + +				n -= CHIP_L2_LINE_SIZE(); +			} +#if CHIP_L2_LINE_SIZE() != 64 +# error "Fix code that assumes particular L2 cache line size." +#endif +		} + +		for (; n >= sizeof(op_t); n -= sizeof(op_t)) +			ST8(dst8++, LD8(src8++)); + +		if (__builtin_expect(n == 0, 1)) +			return RETVAL; + +		final = LD8(src8); +	} + +	/* n != 0 if we get here.  Write out any trailing bytes. */ +	dst1 = (char *)dst8; +#ifndef __BIG_ENDIAN__ +	if (n & 4) { +		ST4((uint32_t *)dst1, final); +		dst1 += 4; +		final >>= 32; +		n &= 3; +	} +	if (n & 2) { +		ST2((uint16_t *)dst1, final); +		dst1 += 2; +		final >>= 16; +		n &= 1; +	} +	if (n) +		ST1((uint8_t *)dst1, final); +#else +	if (n & 4) { +		ST4((uint32_t *)dst1, final >> 32); +		dst1 += 4; +        } +        else +        { +		final >>= 32; +        } +	if (n & 2) { +		ST2((uint16_t *)dst1, final >> 16); +		dst1 += 2; +        } +        else +        { +		final >>= 16; +        } +	if (n & 1) +		ST1((uint8_t *)dst1, final >> 8); +#endif + +	return RETVAL; +} + +#ifdef USERCOPY_FUNC +#undef ST1 +#undef ST2 +#undef ST4 +#undef ST8 +#undef LD1 +#undef LD2 +#undef LD4 +#undef LD8 +#undef USERCOPY_FUNC +#endif diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c deleted file mode 100644 index f7d4a6ad61e..00000000000 --- a/arch/tile/lib/memcpy_tile64.c +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - *   This program is free software; you can redistribute it and/or - *   modify it under the terms of the GNU General Public License - *   as published by the Free Software Foundation, version 2. - * - *   This program is distributed in the hope that it will be useful, but - *   WITHOUT ANY WARRANTY; without even the implied warranty of - *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - *   NON INFRINGEMENT.  See the GNU General Public License for - *   more details. 
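Back in the new memcpy_64.c above, the unaligned path leans on __insn_dblalign(), which produces an aligned 8-byte destination word from two adjacent source words using the byte offset of the original source pointer. A hedged little-endian model, not the architecture's authoritative definition:

/* Sketch: rough model of __insn_dblalign(a, b, ptr) as used in the
 * unaligned loop of memcpy_64.c (little-endian assumed). */
static uint64_t dblalign_model(uint64_t a, uint64_t b, const void *ptr)
{
	unsigned int shift = ((uintptr_t)ptr & 7) * 8;

	return shift ? (a >> shift) | (b << (64 - shift)) : a;
}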
- */ - -#include <linux/string.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/uaccess.h> -#include <asm/fixmap.h> -#include <asm/kmap_types.h> -#include <asm/tlbflush.h> -#include <hv/hypervisor.h> -#include <arch/chip.h> - - -#if !CHIP_HAS_COHERENT_LOCAL_CACHE() - -/* Defined in memcpy.S */ -extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n); -extern unsigned long __copy_to_user_inatomic_asm( -	void __user *to, const void *from, unsigned long n); -extern unsigned long __copy_from_user_inatomic_asm( -	void *to, const void __user *from, unsigned long n); -extern unsigned long __copy_from_user_zeroing_asm( -	void *to, const void __user *from, unsigned long n); - -typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long); - -/* Size above which to consider TLB games for performance */ -#define LARGE_COPY_CUTOFF 2048 - -/* Communicate to the simulator what we are trying to do. */ -#define sim_allow_multiple_caching(b) \ -  __insn_mtspr(SPR_SIM_CONTROL, \ -   SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS)) - -/* - * Copy memory by briefly enabling incoherent cacheline-at-a-time mode. - * - * We set up our own source and destination PTEs that we fully control. - * This is the only way to guarantee that we don't race with another - * thread that is modifying the PTE; we can't afford to try the - * copy_{to,from}_user() technique of catching the interrupt, since - * we must run with interrupts disabled to avoid the risk of some - * other code seeing the incoherent data in our cache.  (Recall that - * our cache is indexed by PA, so even if the other code doesn't use - * our kmap_atomic virtual addresses, they'll still hit in cache using - * the normal VAs that aren't supposed to hit in cache.) - */ -static void memcpy_multicache(void *dest, const void *source, -			      pte_t dst_pte, pte_t src_pte, int len) -{ -	int idx; -	unsigned long flags, newsrc, newdst; -	pmd_t *pmdp; -	pte_t *ptep; -	int type0, type1; -	int cpu = get_cpu(); - -	/* -	 * Disable interrupts so that we don't recurse into memcpy() -	 * in an interrupt handler, nor accidentally reference -	 * the PA of the source from an interrupt routine.  Also -	 * notify the simulator that we're playing games so we don't -	 * generate spurious coherency warnings. -	 */ -	local_irq_save(flags); -	sim_allow_multiple_caching(1); - -	/* Set up the new dest mapping */ -	type0 = kmap_atomic_idx_push(); -	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0; -	newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1)); -	pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst); -	ptep = pte_offset_kernel(pmdp, newdst); -	if (pte_val(*ptep) != pte_val(dst_pte)) { -		set_pte(ptep, dst_pte); -		local_flush_tlb_page(NULL, newdst, PAGE_SIZE); -	} - -	/* Set up the new source mapping */ -	type1 = kmap_atomic_idx_push(); -	idx += (type0 - type1); -	src_pte = hv_pte_set_nc(src_pte); -	src_pte = hv_pte_clear_writable(src_pte);  /* be paranoid */ -	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1)); -	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc); -	ptep = pte_offset_kernel(pmdp, newsrc); -	*ptep = src_pte;   /* set_pte() would be confused by this */ -	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); - -	/* Actually move the data. 
*/ -	__memcpy_asm((void *)newdst, (const void *)newsrc, len); - -	/* -	 * Remap the source as locally-cached and not OLOC'ed so that -	 * we can inval without also invaling the remote cpu's cache. -	 * This also avoids known errata with inv'ing cacheable oloc data. -	 */ -	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3); -	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */ -	*ptep = src_pte;   /* set_pte() would be confused by this */ -	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); - -	/* -	 * Do the actual invalidation, covering the full L2 cache line -	 * at the end since __memcpy_asm() is somewhat aggressive. -	 */ -	__inv_buffer((void *)newsrc, len); - -	/* -	 * We're done: notify the simulator that all is back to normal, -	 * and re-enable interrupts and pre-emption. -	 */ -	kmap_atomic_idx_pop(); -	kmap_atomic_idx_pop(); -	sim_allow_multiple_caching(0); -	local_irq_restore(flags); -	put_cpu(); -} - -/* - * Identify large copies from remotely-cached memory, and copy them - * via memcpy_multicache() if they look good, otherwise fall back - * to the particular kind of copying passed as the memcpy_t function. - */ -static unsigned long fast_copy(void *dest, const void *source, int len, -			       memcpy_t func) -{ -	/* -	 * Check if it's big enough to bother with.  We may end up doing a -	 * small copy via TLB manipulation if we're near a page boundary, -	 * but presumably we'll make it up when we hit the second page. -	 */ -	while (len >= LARGE_COPY_CUTOFF) { -		int copy_size, bytes_left_on_page; -		pte_t *src_ptep, *dst_ptep; -		pte_t src_pte, dst_pte; -		struct page *src_page, *dst_page; - -		/* Is the source page oloc'ed to a remote cpu? */ -retry_source: -		src_ptep = virt_to_pte(current->mm, (unsigned long)source); -		if (src_ptep == NULL) -			break; -		src_pte = *src_ptep; -		if (!hv_pte_get_present(src_pte) || -		    !hv_pte_get_readable(src_pte) || -		    hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3) -			break; -		if (get_remote_cache_cpu(src_pte) == smp_processor_id()) -			break; -		src_page = pfn_to_page(hv_pte_get_pfn(src_pte)); -		get_page(src_page); -		if (pte_val(src_pte) != pte_val(*src_ptep)) { -			put_page(src_page); -			goto retry_source; -		} -		if (pte_huge(src_pte)) { -			/* Adjust the PTE to correspond to a small page */ -			int pfn = hv_pte_get_pfn(src_pte); -			pfn += (((unsigned long)source & (HPAGE_SIZE-1)) -				>> PAGE_SHIFT); -			src_pte = pfn_pte(pfn, src_pte); -			src_pte = pte_mksmall(src_pte); -		} - -		/* Is the destination page writable? */ -retry_dest: -		dst_ptep = virt_to_pte(current->mm, (unsigned long)dest); -		if (dst_ptep == NULL) { -			put_page(src_page); -			break; -		} -		dst_pte = *dst_ptep; -		if (!hv_pte_get_present(dst_pte) || -		    !hv_pte_get_writable(dst_pte)) { -			put_page(src_page); -			break; -		} -		dst_page = pfn_to_page(hv_pte_get_pfn(dst_pte)); -		if (dst_page == src_page) { -			/* -			 * Source and dest are on the same page; this -			 * potentially exposes us to incoherence if any -			 * part of src and dest overlap on a cache line. -			 * Just give up rather than trying to be precise. 
-			 */ -			put_page(src_page); -			break; -		} -		get_page(dst_page); -		if (pte_val(dst_pte) != pte_val(*dst_ptep)) { -			put_page(dst_page); -			goto retry_dest; -		} -		if (pte_huge(dst_pte)) { -			/* Adjust the PTE to correspond to a small page */ -			int pfn = hv_pte_get_pfn(dst_pte); -			pfn += (((unsigned long)dest & (HPAGE_SIZE-1)) -				>> PAGE_SHIFT); -			dst_pte = pfn_pte(pfn, dst_pte); -			dst_pte = pte_mksmall(dst_pte); -		} - -		/* All looks good: create a cachable PTE and copy from it */ -		copy_size = len; -		bytes_left_on_page = -			PAGE_SIZE - (((int)source) & (PAGE_SIZE-1)); -		if (copy_size > bytes_left_on_page) -			copy_size = bytes_left_on_page; -		bytes_left_on_page = -			PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1)); -		if (copy_size > bytes_left_on_page) -			copy_size = bytes_left_on_page; -		memcpy_multicache(dest, source, dst_pte, src_pte, copy_size); - -		/* Release the pages */ -		put_page(dst_page); -		put_page(src_page); - -		/* Continue on the next page */ -		dest += copy_size; -		source += copy_size; -		len -= copy_size; -	} - -	return func(dest, source, len); -} - -void *memcpy(void *to, const void *from, __kernel_size_t n) -{ -	if (n < LARGE_COPY_CUTOFF) -		return (void *)__memcpy_asm(to, from, n); -	else -		return (void *)fast_copy(to, from, n, __memcpy_asm); -} - -unsigned long __copy_to_user_inatomic(void __user *to, const void *from, -				      unsigned long n) -{ -	if (n < LARGE_COPY_CUTOFF) -		return __copy_to_user_inatomic_asm(to, from, n); -	else -		return fast_copy(to, from, n, __copy_to_user_inatomic_asm); -} - -unsigned long __copy_from_user_inatomic(void *to, const void __user *from, -					unsigned long n) -{ -	if (n < LARGE_COPY_CUTOFF) -		return __copy_from_user_inatomic_asm(to, from, n); -	else -		return fast_copy(to, from, n, __copy_from_user_inatomic_asm); -} - -unsigned long __copy_from_user_zeroing(void *to, const void __user *from, -				       unsigned long n) -{ -	if (n < LARGE_COPY_CUTOFF) -		return __copy_from_user_zeroing_asm(to, from, n); -	else -		return fast_copy(to, from, n, __copy_from_user_zeroing_asm); -} - -#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */ diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c new file mode 100644 index 00000000000..88c7016492c --- /dev/null +++ b/arch/tile/lib/memcpy_user_64.c @@ -0,0 +1,94 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + * + * Do memcpy(), but trap and return "n" when a load or store faults. + * + * Note: this idiom only works when memcpy() compiles to a leaf function. + * Here leaf function not only means it does not have calls, but also + * requires no stack operations (sp, stack frame pointer) and no + * use of callee-saved registers, else "jrp lr" will be incorrect since + * unwinding stack frame is bypassed. Since memcpy() is not complex so + * these conditions are satisfied here, but we need to be careful when + * modifying this file. This is not a clean solution but is the best + * one so far. 
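On the calling side these routines keep the usual kernel convention of returning the number of bytes left uncopied when a fault is taken; a hedged caller sketch, not from the patch (function name illustrative):

/* Sketch: a non-zero return means the copy faulted partway through. */
static int fetch_from_user(void *dst, const void __user *src, unsigned long len)
{
	unsigned long left = __copy_from_user_inatomic(dst, src, len);

	return left ? -EFAULT : 0;
}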
+ * + * Also note that we are capturing "n" from the containing scope here. + */ + +#define _ST(p, inst, v)						\ +	({							\ +		asm("1: " #inst " %0, %1;"			\ +		    ".pushsection .coldtext.memcpy,\"ax\";"	\ +		    "2: { move r0, %2; jrp lr };"		\ +		    ".section __ex_table,\"a\";"		\ +		    ".align 8;"					\ +		    ".quad 1b, 2b;"				\ +		    ".popsection"				\ +		    : "=m" (*(p)) : "r" (v), "r" (n));		\ +	}) + +#define _LD(p, inst)						\ +	({							\ +		unsigned long __v;				\ +		asm("1: " #inst " %0, %1;"			\ +		    ".pushsection .coldtext.memcpy,\"ax\";"	\ +		    "2: { move r0, %2; jrp lr };"		\ +		    ".section __ex_table,\"a\";"		\ +		    ".align 8;"					\ +		    ".quad 1b, 2b;"				\ +		    ".popsection"				\ +		    : "=r" (__v) : "m" (*(p)), "r" (n));	\ +		__v;						\ +	}) + +#define USERCOPY_FUNC __copy_to_user_inatomic +#define ST1(p, v) _ST((p), st1, (v)) +#define ST2(p, v) _ST((p), st2, (v)) +#define ST4(p, v) _ST((p), st4, (v)) +#define ST8(p, v) _ST((p), st, (v)) +#define LD1 LD +#define LD2 LD +#define LD4 LD +#define LD8 LD +#include "memcpy_64.c" + +#define USERCOPY_FUNC __copy_from_user_inatomic +#define ST1 ST +#define ST2 ST +#define ST4 ST +#define ST8 ST +#define LD1(p) _LD((p), ld1u) +#define LD2(p) _LD((p), ld2u) +#define LD4(p) _LD((p), ld4u) +#define LD8(p) _LD((p), ld) +#include "memcpy_64.c" + +#define USERCOPY_FUNC __copy_in_user_inatomic +#define ST1(p, v) _ST((p), st1, (v)) +#define ST2(p, v) _ST((p), st2, (v)) +#define ST4(p, v) _ST((p), st4, (v)) +#define ST8(p, v) _ST((p), st, (v)) +#define LD1(p) _LD((p), ld1u) +#define LD2(p) _LD((p), ld2u) +#define LD4(p) _LD((p), ld4u) +#define LD8(p) _LD((p), ld) +#include "memcpy_64.c" + +unsigned long __copy_from_user_zeroing(void *to, const void __user *from, +				       unsigned long n) +{ +	unsigned long rc = __copy_from_user_inatomic(to, from, n); +	if (unlikely(rc)) +		memset(to + n - rc, 0, rc); +	return rc; +} diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c index 57dbb3a5bff..2042bfe6595 100644 --- a/arch/tile/lib/memset_32.c +++ b/arch/tile/lib/memset_32.c @@ -12,13 +12,10 @@   *   more details.   */ -#include <arch/chip.h> -  #include <linux/types.h>  #include <linux/string.h>  #include <linux/module.h> - -#undef memset +#include <arch/chip.h>  void *memset(void *s, int c, size_t n)  { @@ -26,11 +23,7 @@ void *memset(void *s, int c, size_t n)  	int n32;  	uint32_t v16, v32;  	uint8_t *out8 = s; -#if !CHIP_HAS_WH64() -	int ahead32; -#else  	int to_align32; -#endif  	/* Experimentation shows that a trivial tight loop is a win up until  	 * around a size of 20, where writing a word at a time starts to win. @@ -61,21 +54,6 @@ void *memset(void *s, int c, size_t n)  		return s;  	} -#if !CHIP_HAS_WH64() -	/* Use a spare issue slot to start prefetching the first cache -	 * line early. This instruction is free as the store can be buried -	 * in otherwise idle issue slots doing ALU ops. -	 */ -	__insn_prefetch(out8); - -	/* We prefetch the end so that a short memset that spans two cache -	 * lines gets some prefetching benefit. Again we believe this is free -	 * to issue. -	 */ -	__insn_prefetch(&out8[n - 1]); -#endif /* !CHIP_HAS_WH64() */ - -  	/* Align 'out8'. We know n >= 3 so this won't write past the end. */  	while (((uintptr_t) out8 & 3) != 0) {  		*out8++ = c; @@ -96,90 +74,6 @@ void *memset(void *s, int c, size_t n)  	/* This must be at least 8 or the following loop doesn't work. 
*/  #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) -#if !CHIP_HAS_WH64() - -	ahead32 = CACHE_LINE_SIZE_IN_WORDS; - -	/* We already prefetched the first and last cache lines, so -	 * we only need to do more prefetching if we are storing -	 * to more than two cache lines. -	 */ -	if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) { -		int i; - -		/* Prefetch the next several cache lines. -		 * This is the setup code for the software-pipelined -		 * loop below. -		 */ -#define MAX_PREFETCH 5 -		ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS; -		if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS) -			ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS; - -		for (i = CACHE_LINE_SIZE_IN_WORDS; -		     i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS) -			__insn_prefetch(&out32[i]); -	} - -	if (n32 > ahead32) { -		while (1) { -			int j; - -			/* Prefetch by reading one word several cache lines -			 * ahead.  Since loads are non-blocking this will -			 * cause the full cache line to be read while we are -			 * finishing earlier cache lines.  Using a store -			 * here causes microarchitectural performance -			 * problems where a victimizing store miss goes to -			 * the head of the retry FIFO and locks the pipe for -			 * a few cycles.  So a few subsequent stores in this -			 * loop go into the retry FIFO, and then later -			 * stores see other stores to the same cache line -			 * are already in the retry FIFO and themselves go -			 * into the retry FIFO, filling it up and grinding -			 * to a halt waiting for the original miss to be -			 * satisfied. -			 */ -			__insn_prefetch(&out32[ahead32]); - -#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0 -#error "Unhandled CACHE_LINE_SIZE_IN_WORDS" -#endif - -			n32 -= CACHE_LINE_SIZE_IN_WORDS; - -			/* Save icache space by only partially unrolling -			 * this loop. -			 */ -			for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) { -				*out32++ = v32; -				*out32++ = v32; -				*out32++ = v32; -				*out32++ = v32; -			} - -			/* To save compiled code size, reuse this loop even -			 * when we run out of prefetching to do by dropping -			 * ahead32 down. -			 */ -			if (n32 <= ahead32) { -				/* Not even a full cache line left, -				 * so stop now. -				 */ -				if (n32 < CACHE_LINE_SIZE_IN_WORDS) -					break; - -				/* Choose a small enough value that we don't -				 * prefetch past the end.  There's no sense -				 * in touching cache lines we don't have to. -				 */ -				ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1; -			} -		} -	} - -#else /* CHIP_HAS_WH64() */ -  	/* Determine how many words we need to emit before the 'out32'  	 * pointer becomes aligned modulo the cache line size.  	 */ @@ -236,8 +130,6 @@ void *memset(void *s, int c, size_t n)  		n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;  	} -#endif /* CHIP_HAS_WH64() */ -  	/* Now handle any leftover values. */  	if (n32 != 0) {  		do { diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c new file mode 100644 index 00000000000..03ef69cd73d --- /dev/null +++ b/arch/tile/lib/memset_64.c @@ -0,0 +1,142 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  
See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +#include <arch/chip.h> +#include "string-endian.h" + +void *memset(void *s, int c, size_t n) +{ +	uint64_t *out64; +	int n64, to_align64; +	uint64_t v64; +	uint8_t *out8 = s; + +	/* Experimentation shows that a trivial tight loop is a win up until +	 * around a size of 20, where writing a word at a time starts to win. +	 */ +#define BYTE_CUTOFF 20 + +#if BYTE_CUTOFF < 7 +	/* This must be at least at least this big, or some code later +	 * on doesn't work. +	 */ +#error "BYTE_CUTOFF is too small" +#endif + +	if (n < BYTE_CUTOFF) { +		/* Strangely, this turns out to be the tightest way to +		 * write this loop. +		 */ +		if (n != 0) { +			do { +				/* Strangely, combining these into one line +				 * performs worse. +				 */ +				*out8 = c; +				out8++; +			} while (--n != 0); +		} + +		return s; +	} + +	/* Align 'out8'. We know n >= 7 so this won't write past the end. */ +	while (((uintptr_t) out8 & 7) != 0) { +		*out8++ = c; +		--n; +	} + +	/* Align 'n'. */ +	while (n & 7) +		out8[--n] = c; + +	out64 = (uint64_t *) out8; +	n64 = n >> 3; + +	/* Tile input byte out to 64 bits. */ +	v64 = copy_byte(c); + +	/* This must be at least 8 or the following loop doesn't work. */ +#define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8) + +	/* Determine how many words we need to emit before the 'out32' +	 * pointer becomes aligned modulo the cache line size. +	 */ +	to_align64 = (-((uintptr_t)out64 >> 3)) & +		(CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1); + +	/* Only bother aligning and using wh64 if there is at least +	 * one full cache line to process.  This check also prevents +	 * overrunning the end of the buffer with alignment words. +	 */ +	if (to_align64 <= n64 - CACHE_LINE_SIZE_IN_DOUBLEWORDS) { +		int lines_left; + +		/* Align out64 mod the cache line size so we can use wh64. */ +		n64 -= to_align64; +		for (; to_align64 != 0; to_align64--) { +			*out64 = v64; +			out64++; +		} + +		/* Use unsigned divide to turn this into a right shift. */ +		lines_left = (unsigned)n64 / CACHE_LINE_SIZE_IN_DOUBLEWORDS; + +		do { +			/* Only wh64 a few lines at a time, so we don't +			 * exceed the maximum number of victim lines. +			 */ +			int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS()) +				  ? lines_left +				  : CHIP_MAX_OUTSTANDING_VICTIMS()); +			uint64_t *wh = out64; +			int i = x; +			int j; + +			lines_left -= x; + +			do { +				__insn_wh64(wh); +				wh += CACHE_LINE_SIZE_IN_DOUBLEWORDS; +			} while (--i); + +			for (j = x * (CACHE_LINE_SIZE_IN_DOUBLEWORDS / 4); +			     j != 0; j--) { +				*out64++ = v64; +				*out64++ = v64; +				*out64++ = v64; +				*out64++ = v64; +			} +		} while (lines_left != 0); + +		/* We processed all full lines above, so only this many +		 * words remain to be processed. +		 */ +		n64 &= CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1; +	} + +	/* Now handle any leftover values. 
*/ +	if (n64 != 0) { +		do { +			*out64 = v64; +			out64++; +		} while (--n64 != 0); +	} + +	return s; +} +EXPORT_SYMBOL(memset); diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c index 485e24d62c6..b34f79aada4 100644 --- a/arch/tile/lib/spinlock_32.c +++ b/arch/tile/lib/spinlock_32.c @@ -15,6 +15,7 @@  #include <linux/spinlock.h>  #include <linux/module.h>  #include <asm/processor.h> +#include <arch/spr_def.h>  #include "spinlock_common.h" @@ -91,75 +92,75 @@ EXPORT_SYMBOL(arch_spin_unlock_wait);  #define RD_COUNT_MASK   ((1 << RD_COUNT_WIDTH) - 1) -/* Lock the word, spinning until there are no tns-ers. */ -static inline u32 get_rwlock(arch_rwlock_t *rwlock) -{ -	u32 iterations = 0; -	for (;;) { -		u32 val = __insn_tns((int *)&rwlock->lock); -		if (unlikely(val & 1)) { -			delay_backoff(iterations++); -			continue; -		} -		return val; -	} -} - -int arch_read_trylock_slow(arch_rwlock_t *rwlock) -{ -	u32 val = get_rwlock(rwlock); -	int locked = (val << RD_COUNT_WIDTH) == 0; -	rwlock->lock = val + (locked << RD_COUNT_SHIFT); -	return locked; -} -EXPORT_SYMBOL(arch_read_trylock_slow); - -void arch_read_unlock_slow(arch_rwlock_t *rwlock) -{ -	u32 val = get_rwlock(rwlock); -	rwlock->lock = val - (1 << RD_COUNT_SHIFT); -} -EXPORT_SYMBOL(arch_read_unlock_slow); - -void arch_write_unlock_slow(arch_rwlock_t *rwlock, u32 val) +/* + * We can get the read lock if everything but the reader bits (which + * are in the high part of the word) is zero, i.e. no active or + * waiting writers, no tns. + * + * We guard the tns/store-back with an interrupt critical section to + * preserve the semantic that the same read lock can be acquired in an + * interrupt context. + */ +int arch_read_trylock(arch_rwlock_t *rwlock)  { -	u32 eq, mask = 1 << WR_CURR_SHIFT; -	while (unlikely(val & 1)) { -		/* Limited backoff since we are the highest-priority task. */ -		relax(4); -		val = __insn_tns((int *)&rwlock->lock); +	u32 val; +	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); +	val = __insn_tns((int *)&rwlock->lock); +	if (likely((val << _RD_COUNT_WIDTH) == 0)) { +		val += 1 << RD_COUNT_SHIFT; +		rwlock->lock = val; +		__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); +		BUG_ON(val == 0);  /* we don't expect wraparound */ +		return 1;  	} -	val = __insn_addb(val, mask); -	eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT)); -	val = __insn_mz(eq & mask, val); -	rwlock->lock = val; +	if ((val & 1) == 0) +		rwlock->lock = val; +	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); +	return 0;  } -EXPORT_SYMBOL(arch_write_unlock_slow); +EXPORT_SYMBOL(arch_read_trylock);  /* - * We spin until everything but the reader bits (which are in the high - * part of the word) are zero, i.e. no active or waiting writers, no tns. - * + * Spin doing arch_read_trylock() until we acquire the lock.   * ISSUE: This approach can permanently starve readers.  A reader who sees   * a writer could instead take a ticket lock (just like a writer would),   * and atomically enter read mode (with 1 reader) when it gets the ticket. - * This way both readers and writers will always make forward progress + * This way both readers and writers would always make forward progress   * in a finite time.   
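
The core of the new memset_64.c above is a plain word fill: replicate the byte across a 64-bit value, byte-store until the pointer is 8-byte aligned, then store doublewords and finish the tail. A portable sketch of just that structure, using a multiply where the kernel uses copy_byte()/shufflebytes and leaving out the wh64 cache-line priming and the short-length cutoff:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void *memset64_sketch(void *s, int c, size_t n)
{
	uint8_t *out8 = s;
	uint64_t v64 = (uint8_t)c * 0x0101010101010101ULL;	/* tile byte out */

	while (n != 0 && ((uintptr_t)out8 & 7) != 0) {	/* align the pointer */
		*out8++ = (uint8_t)c;
		--n;
	}

	uint64_t *out64 = (uint64_t *)out8;
	for (size_t n64 = n >> 3; n64 != 0; --n64)	/* aligned stores */
		*out64++ = v64;

	memset(out64, c, n & 7);			/* leftover tail bytes */
	return s;
}

int main(void)
{
	char *buf = malloc(41);
	memset64_sketch(buf, 'x', 40);
	buf[40] = '\0';
	printf("%s\n", buf);		/* forty 'x' characters */
	free(buf);
	return 0;
}
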
*/ -void arch_read_lock_slow(arch_rwlock_t *rwlock, u32 val) +void arch_read_lock(arch_rwlock_t *rwlock)  {  	u32 iterations = 0; -	do { -		if (!(val & 1)) -			rwlock->lock = val; +	while (unlikely(!arch_read_trylock(rwlock)))  		delay_backoff(iterations++); +} +EXPORT_SYMBOL(arch_read_lock); + +void arch_read_unlock(arch_rwlock_t *rwlock) +{ +	u32 val, iterations = 0; + +	mb();  /* guarantee anything modified under the lock is visible */ +	for (;;) { +		__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);  		val = __insn_tns((int *)&rwlock->lock); -	} while ((val << RD_COUNT_WIDTH) != 0); -	rwlock->lock = val + (1 << RD_COUNT_SHIFT); +		if (likely((val & 1) == 0)) { +			rwlock->lock = val - (1 << _RD_COUNT_SHIFT); +			__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); +			break; +		} +		__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); +		delay_backoff(iterations++); +	}  } -EXPORT_SYMBOL(arch_read_lock_slow); +EXPORT_SYMBOL(arch_read_unlock); -void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val) +/* + * We don't need an interrupt critical section here (unlike for + * arch_read_lock) since we should never use a bare write lock where + * it could be interrupted by code that could try to re-acquire it. + */ +void arch_write_lock(arch_rwlock_t *rwlock)  {  	/*  	 * The trailing underscore on this variable (and curr_ below) @@ -167,23 +168,36 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)  	 * when we compare them.  	 */  	u32 my_ticket_; +	u32 iterations = 0; +	u32 val = __insn_tns((int *)&rwlock->lock); -	/* Take out the next ticket; this will also stop would-be readers. */ -	if (val & 1) -		val = get_rwlock(rwlock); -	rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT); - -	/* Extract my ticket value from the original word. */ -	my_ticket_ = val >> WR_NEXT_SHIFT; +	if (likely(val == 0)) { +		rwlock->lock = 1 << _WR_NEXT_SHIFT; +		return; +	}  	/* -	 * Wait until the "current" field matches our ticket, and -	 * there are no remaining readers. +	 * Wait until there are no readers, then bump up the next +	 * field and capture the ticket value.  	 */  	for (;;) { +		if (!(val & 1)) { +			if ((val >> RD_COUNT_SHIFT) == 0) +				break; +			rwlock->lock = val; +		} +		delay_backoff(iterations++); +		val = __insn_tns((int *)&rwlock->lock); +	} + +	/* Take out the next ticket and extract my ticket value. */ +	rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT); +	my_ticket_ = val >> WR_NEXT_SHIFT; + +	/* Wait until the "current" field matches our ticket. */ +	for (;;) {  		u32 curr_ = val >> WR_CURR_SHIFT; -		u32 readers = val >> RD_COUNT_SHIFT; -		u32 delta = ((my_ticket_ - curr_) & WR_MASK) + !!readers; +		u32 delta = ((my_ticket_ - curr_) & WR_MASK);  		if (likely(delta == 0))  			break; @@ -199,23 +213,47 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)  			relax(4);  	}  } -EXPORT_SYMBOL(arch_write_lock_slow); +EXPORT_SYMBOL(arch_write_lock); -int __tns_atomic_acquire(atomic_t *lock) +int arch_write_trylock(arch_rwlock_t *rwlock)  { -	int ret; -	u32 iterations = 0; +	u32 val = __insn_tns((int *)&rwlock->lock); -	BUG_ON(__insn_mfspr(SPR_INTERRUPT_CRITICAL_SECTION)); -	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); +	/* +	 * If a tns is in progress, or there's a waiting or active locker, +	 * or active readers, we can't take the lock, so give up. 
+	 */ +	if (unlikely(val != 0)) { +		if (!(val & 1)) +			rwlock->lock = val; +		return 0; +	} -	while ((ret = __insn_tns((void *)&lock->counter)) == 1) -		delay_backoff(iterations++); -	return ret; +	/* Set the "next" field to mark it locked. */ +	rwlock->lock = 1 << _WR_NEXT_SHIFT; +	return 1;  } +EXPORT_SYMBOL(arch_write_trylock); -void __tns_atomic_release(atomic_t *p, int v) +void arch_write_unlock(arch_rwlock_t *rwlock)  { -	p->counter = v; -	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); +	u32 val, eq, mask; + +	mb();  /* guarantee anything modified under the lock is visible */ +	val = __insn_tns((int *)&rwlock->lock); +	if (likely(val == (1 << _WR_NEXT_SHIFT))) { +		rwlock->lock = 0; +		return; +	} +	while (unlikely(val & 1)) { +		/* Limited backoff since we are the highest-priority task. */ +		relax(4); +		val = __insn_tns((int *)&rwlock->lock); +	} +	mask = 1 << WR_CURR_SHIFT; +	val = __insn_addb(val, mask); +	eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT)); +	val = __insn_mz(eq & mask, val); +	rwlock->lock = val;  } +EXPORT_SYMBOL(arch_write_unlock); diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c new file mode 100644 index 00000000000..d6fb9581e98 --- /dev/null +++ b/arch/tile/lib/spinlock_64.c @@ -0,0 +1,104 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/spinlock.h> +#include <linux/module.h> +#include <asm/processor.h> + +#include "spinlock_common.h" + +/* + * Read the spinlock value without allocating in our cache and without + * causing an invalidation to another cpu with a copy of the cacheline. + * This is important when we are spinning waiting for the lock. + */ +static inline u32 arch_spin_read_noalloc(void *lock) +{ +	return atomic_cmpxchg((atomic_t *)lock, -1, -1); +} + +/* + * Wait until the high bits (current) match my ticket. + * If we notice the overflow bit set on entry, we clear it. + */ +void arch_spin_lock_slow(arch_spinlock_t *lock, u32 my_ticket) +{ +	if (unlikely(my_ticket & __ARCH_SPIN_NEXT_OVERFLOW)) { +		__insn_fetchand4(&lock->lock, ~__ARCH_SPIN_NEXT_OVERFLOW); +		my_ticket &= ~__ARCH_SPIN_NEXT_OVERFLOW; +	} + +	for (;;) { +		u32 val = arch_spin_read_noalloc(lock); +		u32 delta = my_ticket - arch_spin_current(val); +		if (delta == 0) +			return; +		relax((128 / CYCLES_PER_RELAX_LOOP) * delta); +	} +} +EXPORT_SYMBOL(arch_spin_lock_slow); + +/* + * Check the lock to see if it is plausible, and try to get it with cmpxchg(). 
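
The new arch_read_trylock() above folds the old *_slow helpers into a single tns/store sequence guarded by SPR_INTERRUPT_CRITICAL_SECTION, so the same read lock stays acquirable from interrupt context. A rough user-space sketch of the word handling, with the tns instruction modelled as an atomic exchange with 1, the SPR writes omitted, and the field layout (busy bit, writer fields, reader count in the top bits) chosen purely for illustration:

#include <stdatomic.h>
#include <stdint.h>

#define RD_SHIFT 24			/* illustrative, not the kernel's */
#define RD_WIDTH (32 - RD_SHIFT)

typedef struct { atomic_uint lock; } rwlock_sketch_t;

int read_trylock_sketch(rwlock_sketch_t *rw)
{
	/* "tns": grab the current word and leave 1 (busy) behind.
	 * The real code brackets this with interrupt-critical-section SPRs.
	 */
	uint32_t val = atomic_exchange(&rw->lock, 1);

	/* Shifting out the reader bits leaves the busy flag and writer
	 * fields; if they are all zero we may take the read lock.
	 */
	if ((uint32_t)(val << RD_WIDTH) == 0) {
		atomic_store(&rw->lock, val + (1u << RD_SHIFT));
		return 1;
	}

	/* Put back whatever real value we displaced (unless another tns
	 * was already in flight and we only displaced the busy flag).
	 */
	if ((val & 1) == 0)
		atomic_store(&rw->lock, val);
	return 0;
}

arch_read_lock() is then just this trylock retried in a delay_backoff() loop, which is why the separate slow-path entry points disappear from the file.
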
+ */ +int arch_spin_trylock(arch_spinlock_t *lock) +{ +	u32 val = arch_spin_read_noalloc(lock); +	if (unlikely(arch_spin_current(val) != arch_spin_next(val))) +		return 0; +	return cmpxchg(&lock->lock, val, (val + 1) & ~__ARCH_SPIN_NEXT_OVERFLOW) +		== val; +} +EXPORT_SYMBOL(arch_spin_trylock); + +void arch_spin_unlock_wait(arch_spinlock_t *lock) +{ +	u32 iterations = 0; +	while (arch_spin_is_locked(lock)) +		delay_backoff(iterations++); +} +EXPORT_SYMBOL(arch_spin_unlock_wait); + +/* + * If the read lock fails due to a writer, we retry periodically + * until the value is positive and we write our incremented reader count. + */ +void __read_lock_failed(arch_rwlock_t *rw) +{ +	u32 val; +	int iterations = 0; +	do { +		delay_backoff(iterations++); +		val = __insn_fetchaddgez4(&rw->lock, 1); +	} while (unlikely(arch_write_val_locked(val))); +} +EXPORT_SYMBOL(__read_lock_failed); + +/* + * If we failed because there were readers, clear the "writer" bit + * so we don't block additional readers.  Otherwise, there was another + * writer anyway, so our "fetchor" made no difference.  Then wait, + * issuing periodic fetchor instructions, till we get the lock. + */ +void __write_lock_failed(arch_rwlock_t *rw, u32 val) +{ +	int iterations = 0; +	do { +		if (!arch_write_val_locked(val)) +			val = __insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT); +		delay_backoff(iterations++); +		val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT); +	} while (val != 0); +} +EXPORT_SYMBOL(__write_lock_failed); diff --git a/arch/tile/lib/spinlock_common.h b/arch/tile/lib/spinlock_common.h index c1010980913..6ac37509fac 100644 --- a/arch/tile/lib/spinlock_common.h +++ b/arch/tile/lib/spinlock_common.h @@ -60,5 +60,5 @@ static void delay_backoff(int iterations)  	loops += __insn_crc32_32(stack_pointer, get_cycles_low()) &  		(loops - 1); -	relax(1 << exponent); +	relax(loops);  } diff --git a/arch/tile/lib/strchr_32.c b/arch/tile/lib/strchr_32.c index c94e6f7ae7b..841fe696301 100644 --- a/arch/tile/lib/strchr_32.c +++ b/arch/tile/lib/strchr_32.c @@ -16,8 +16,6 @@  #include <linux/string.h>  #include <linux/module.h> -#undef strchr -  char *strchr(const char *s, int c)  {  	int z, g; diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c new file mode 100644 index 00000000000..fe6e31c06f8 --- /dev/null +++ b/arch/tile/lib/strchr_64.c @@ -0,0 +1,62 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +#include "string-endian.h" + +char *strchr(const char *s, int c) +{ +	int z, g; + +	/* Get an aligned pointer. */ +	const uintptr_t s_int = (uintptr_t) s; +	const uint64_t *p = (const uint64_t *)(s_int & -8); + +	/* Create eight copies of the byte for which we are looking. */ +	const uint64_t goal = copy_byte(c); + +	/* Read the first aligned word, but force bytes before the string to +	 * match neither zero nor goal (we make sure the high bit of each +	 * byte is 1, and the low 7 bits are all the opposite of the goal +	 * byte). 
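
spinlock_64.c above builds arch_spinlock_t as a ticket lock whose word packs a "current" and a "next" field, read while spinning via the non-allocating cmpxchg(lock, -1, -1) trick. A small sketch of the trylock/unlock fast paths under an assumed 16/16 bit split, ignoring the __ARCH_SPIN_NEXT_OVERFLOW handling:

#include <stdatomic.h>

typedef struct { atomic_uint lock; } ticket_lock_t;

static unsigned int tl_current(unsigned int v) { return v >> 16; }
static unsigned int tl_next(unsigned int v)    { return v & 0xffffu; }

int ticket_trylock(ticket_lock_t *l)
{
	unsigned int val = atomic_load(&l->lock);

	if (tl_current(val) != tl_next(val))
		return 0;			/* another ticket is outstanding */

	/* Claim the next ticket; the CAS fails if someone raced us. */
	return atomic_compare_exchange_strong(&l->lock, &val, val + 1);
}

void ticket_unlock(ticket_lock_t *l)
{
	/* Pass ownership to the next ticket holder by bumping "current". */
	atomic_fetch_add(&l->lock, 1u << 16);
}

arch_spin_lock_slow() then waits roughly in proportion to how many tickets are still ahead of it (relax((128 / CYCLES_PER_RELAX_LOOP) * delta)) rather than backing off blindly.
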
+	 */ +	const uint64_t before_mask = MASK(s_int); +	uint64_t v = (*p | before_mask) ^ (goal & __insn_v1shrui(before_mask, 1)); + +	uint64_t zero_matches, goal_matches; +	while (1) { +		/* Look for a terminating '\0'. */ +		zero_matches = __insn_v1cmpeqi(v, 0); + +		/* Look for the goal byte. */ +		goal_matches = __insn_v1cmpeq(v, goal); + +		if (__builtin_expect((zero_matches | goal_matches) != 0, 0)) +			break; + +		v = *++p; +	} + +	z = CFZ(zero_matches); +	g = CFZ(goal_matches); + +	/* If we found c before '\0' we got a match. Note that if c == '\0' +	 * then g == z, and we correctly return the address of the '\0' +	 * rather than NULL. +	 */ +	return (g <= z) ? ((char *)p) + (g >> 3) : NULL; +} +EXPORT_SYMBOL(strchr); diff --git a/arch/tile/lib/string-endian.h b/arch/tile/lib/string-endian.h new file mode 100644 index 00000000000..2e49cbfe937 --- /dev/null +++ b/arch/tile/lib/string-endian.h @@ -0,0 +1,44 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + * + * Provide a mask based on the pointer alignment that + * sets up non-zero bytes before the beginning of the string. + * The MASK expression works because shift counts are taken mod 64. + * Also, specify how to count "first" and "last" bits + * when the bits have been read as a word. + */ + +#include <asm/byteorder.h> + +#ifdef __LITTLE_ENDIAN +#define MASK(x) (__insn_shl(1ULL, (x << 3)) - 1) +#define NULMASK(x) ((2ULL << x) - 1) +#define CFZ(x) __insn_ctz(x) +#define REVCZ(x) __insn_clz(x) +#else +#define MASK(x) (__insn_shl(-2LL, ((-x << 3) - 1))) +#define NULMASK(x) (-2LL << (63 - x)) +#define CFZ(x) __insn_clz(x) +#define REVCZ(x) __insn_ctz(x) +#endif + +/* + * Create eight copies of the byte in a uint64_t.  Byte Shuffle uses + * the bytes of srcB as the index into the dest vector to select a + * byte.  With all indices of zero, the first byte is copied into all + * the other bytes. + */ +static inline uint64_t copy_byte(uint8_t byte) +{ +	return __insn_shufflebytes(byte, 0, 0); +} diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c index 4974292a553..f26f88e11e4 100644 --- a/arch/tile/lib/strlen_32.c +++ b/arch/tile/lib/strlen_32.c @@ -16,8 +16,6 @@  #include <linux/string.h>  #include <linux/module.h> -#undef strlen -  size_t strlen(const char *s)  {  	/* Get an aligned pointer. */ diff --git a/arch/tile/lib/strlen_64.c b/arch/tile/lib/strlen_64.c new file mode 100644 index 00000000000..9583fc3361f --- /dev/null +++ b/arch/tile/lib/strlen_64.c @@ -0,0 +1,35 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  
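
strchr_64.c above (and strlen_64.c/strnlen_64.c just below) scan the string one aligned 64-bit word at a time, using v1cmpeqi to find NUL bytes and the MASK()/CFZ() helpers from string-endian.h to handle the unaligned head and locate the first match. A portable sketch of the same idea, with the classic has-zero-byte bit trick and __builtin_ctzll standing in for the Tile instructions and a little-endian layout assumed:

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Nonzero iff some byte of v is zero. */
static inline uint64_t has_zero_byte(uint64_t v)
{
	return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
}

size_t strlen_sketch(const char *s)
{
	const uintptr_t s_int = (uintptr_t)s;
	const uint64_t *p = (const uint64_t *)(s_int & -8);

	/* Read the aligned word containing the start of the string, with
	 * the bytes before the string forced nonzero (MASK()'s job).
	 */
	uint64_t v = *p | ((1ULL << ((s_int & 7) * 8)) - 1);

	uint64_t bits;
	while ((bits = has_zero_byte(v)) == 0)
		v = *++p;

	/* The lowest set bit marks the first NUL byte (little-endian). */
	return (const char *)p + (__builtin_ctzll(bits) >> 3) - s;
}

int main(void)
{
	printf("%zu\n", strlen_sketch("hello, tile"));	/* prints 11 */
	return 0;
}

strchr_64.c runs the same loop but additionally compares against a copy_byte()-replicated goal word and returns whichever match comes first, which is also why c == '\0' correctly yields the address of the terminator.
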
See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +#include "string-endian.h" + +size_t strlen(const char *s) +{ +	/* Get an aligned pointer. */ +	const uintptr_t s_int = (uintptr_t) s; +	const uint64_t *p = (const uint64_t *)(s_int & -8); + +	/* Read and MASK the first word. */ +	uint64_t v = *p | MASK(s_int); + +	uint64_t bits; +	while ((bits = __insn_v1cmpeqi(v, 0)) == 0) +		v = *++p; + +	return ((const char *)p) + (CFZ(bits) >> 3) - s; +} +EXPORT_SYMBOL(strlen); diff --git a/arch/tile/lib/strnlen_32.c b/arch/tile/lib/strnlen_32.c new file mode 100644 index 00000000000..1434141d9e0 --- /dev/null +++ b/arch/tile/lib/strnlen_32.c @@ -0,0 +1,47 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> + +size_t strnlen(const char *s, size_t count) +{ +	/* Get an aligned pointer. */ +	const uintptr_t s_int = (uintptr_t) s; +	const uint32_t *p = (const uint32_t *)(s_int & -4); +	size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1)); +	size_t len; +	uint32_t v, bits; + +	/* Avoid page fault risk by not reading any bytes when count is 0. */ +	if (count == 0) +		return 0; + +	/* Read first word, but force bytes before the string to be nonzero. */ +	v = *p | ((1 << ((s_int << 3) & 31)) - 1); + +	while ((bits = __insn_seqb(v, 0)) == 0) { +		if (bytes_read >= count) { +			/* Read COUNT bytes and didn't find the terminator. */ +			return count; +		} +		v = *++p; +		bytes_read += sizeof(v); +	} + +	len = ((const char *) p) + (__insn_ctz(bits) >> 3) - s; +	return (len < count ? len : count); +} +EXPORT_SYMBOL(strnlen); diff --git a/arch/tile/lib/strnlen_64.c b/arch/tile/lib/strnlen_64.c new file mode 100644 index 00000000000..2e8de6a5136 --- /dev/null +++ b/arch/tile/lib/strnlen_64.c @@ -0,0 +1,48 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +#include "string-endian.h" + +size_t strnlen(const char *s, size_t count) +{ +	/* Get an aligned pointer. */ +	const uintptr_t s_int = (uintptr_t) s; +	const uint64_t *p = (const uint64_t *)(s_int & -8); +	size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1)); +	size_t len; +	uint64_t v, bits; + +	/* Avoid page fault risk by not reading any bytes when count is 0. */ +	if (count == 0) +		return 0; + +	/* Read and MASK the first word. 
*/ +	v = *p | MASK(s_int); + +	while ((bits = __insn_v1cmpeqi(v, 0)) == 0) { +		if (bytes_read >= count) { +			/* Read COUNT bytes and didn't find the terminator. */ +			return count; +		} +		v = *++p; +		bytes_read += sizeof(v); +	} + +	len = ((const char *) p) + (CFZ(bits) >> 3) - s; +	return (len < count ? len : count); +} +EXPORT_SYMBOL(strnlen); diff --git a/arch/tile/lib/uaccess.c b/arch/tile/lib/uaccess.c index f8d398c9ee7..030abe3ee4f 100644 --- a/arch/tile/lib/uaccess.c +++ b/arch/tile/lib/uaccess.c @@ -22,11 +22,3 @@ int __range_ok(unsigned long addr, unsigned long size)  		 is_arch_mappable_range(addr, size));  }  EXPORT_SYMBOL(__range_ok); - -#ifdef CONFIG_DEBUG_COPY_FROM_USER -void copy_from_user_overflow(void) -{ -       WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); -#endif diff --git a/arch/tile/lib/usercopy_32.S b/arch/tile/lib/usercopy_32.S index 979f76d8374..1bc16222463 100644 --- a/arch/tile/lib/usercopy_32.S +++ b/arch/tile/lib/usercopy_32.S @@ -19,82 +19,6 @@  /* Access user memory, but use MMU to avoid propagating kernel exceptions. */ -	.pushsection .fixup,"ax" - -get_user_fault: -	{ move r0, zero; move r1, zero } -	{ movei r2, -EFAULT; jrp lr } -	ENDPROC(get_user_fault) - -put_user_fault: -	{ movei r0, -EFAULT; jrp lr } -	ENDPROC(put_user_fault) - -	.popsection - -/* - * __get_user_N functions take a pointer in r0, and return 0 in r2 - * on success, with the value in r0; or else -EFAULT in r2. - */ -#define __get_user_N(bytes, LOAD) \ -	STD_ENTRY(__get_user_##bytes); \ -1:	{ LOAD r0, r0; move r1, zero; move r2, zero }; \ -	jrp lr; \ -	STD_ENDPROC(__get_user_##bytes); \ -	.pushsection __ex_table,"a"; \ -	.word 1b, get_user_fault; \ -	.popsection - -__get_user_N(1, lb_u) -__get_user_N(2, lh_u) -__get_user_N(4, lw) - -/* - * __get_user_8 takes a pointer in r0, and returns 0 in r2 - * on success, with the value in r0/r1; or else -EFAULT in r2. - */ -	STD_ENTRY(__get_user_8); -1:	{ lw r0, r0; addi r1, r0, 4 }; -2:	{ lw r1, r1; move r2, zero }; -	jrp lr; -	STD_ENDPROC(__get_user_8); -	.pushsection __ex_table,"a"; -	.word 1b, get_user_fault; -	.word 2b, get_user_fault; -	.popsection - -/* - * __put_user_N functions take a value in r0 and a pointer in r1, - * and return 0 in r0 on success or -EFAULT on failure. - */ -#define __put_user_N(bytes, STORE) \ -	STD_ENTRY(__put_user_##bytes); \ -1:	{ STORE r1, r0; move r0, zero }; \ -	jrp lr; \ -	STD_ENDPROC(__put_user_##bytes); \ -	.pushsection __ex_table,"a"; \ -	.word 1b, put_user_fault; \ -	.popsection - -__put_user_N(1, sb) -__put_user_N(2, sh) -__put_user_N(4, sw) - -/* - * __put_user_8 takes a value in r0/r1 and a pointer in r2, - * and returns 0 in r0 on success or -EFAULT on failure. - */ -STD_ENTRY(__put_user_8) -1:      { sw r2, r0; addi r2, r2, 4 } -2:      { sw r2, r1; move r0, zero } -	jrp lr -	STD_ENDPROC(__put_user_8) -	.pushsection __ex_table,"a" -	.word 1b, put_user_fault -	.word 2b, put_user_fault -	.popsection - -  /*   * strnlen_user_asm takes the pointer in r0, and the length bound in r1.   * It returns the length, including the terminating NUL, or zero on exception. 
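
strnlen_32.c and strnlen_64.c above add one twist to that scan: they track how many in-bounds bytes each aligned read has covered and give up with count once the budget is spent, and they never read memory at all when count is 0. A sketch of that bookkeeping on top of the portable zero-byte test (same little-endian and bit-trick assumptions as the strlen sketch above):

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

static inline uint64_t has_zero_byte(uint64_t v)
{
	return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
}

size_t strnlen_sketch(const char *s, size_t count)
{
	const uintptr_t s_int = (uintptr_t)s;
	const uint64_t *p = (const uint64_t *)(s_int & -8);
	size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
	uint64_t v, bits;
	size_t len;

	/* Never touch memory at all when the bound is zero. */
	if (count == 0)
		return 0;

	/* First aligned word, with the bytes before the string forced
	 * nonzero (the job MASK(s_int) does in the kernel version).
	 */
	v = *p | ((1ULL << ((s_int & 7) * 8)) - 1);

	while ((bits = has_zero_byte(v)) == 0) {
		if (bytes_read >= count)
			return count;	/* budget spent, no NUL seen */
		v = *++p;
		bytes_read += sizeof(v);
	}

	len = (const char *)p + (__builtin_ctzll(bits) >> 3) - s;
	return len < count ? len : count;
}

int main(void)
{
	printf("%zu %zu\n", strnlen_sketch("tile", 2),
	       strnlen_sketch("tile", 16));	/* prints "2 4" */
	return 0;
}
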
@@ -112,6 +36,7 @@ strnlen_user_fault:  	{ move r0, zero; jrp lr }  	ENDPROC(strnlen_user_fault)  	.section __ex_table,"a" +	.align 4  	.word 1b, strnlen_user_fault  	.popsection @@ -123,18 +48,20 @@ strnlen_user_fault:   */  STD_ENTRY(strncpy_from_user_asm)  	{ bz r2, 2f; move r3, r0 } -1:      { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 } +1:	{ lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }  	{ sb r0, r4; addi r0, r0, 1 } -	bz r2, 2f -	bnzt r4, 1b -	addi r0, r0, -1   /* don't count the trailing NUL */ -2:      { sub r0, r0, r3; jrp lr } +	bz r4, 2f +	bnzt r2, 1b +	{ sub r0, r0, r3; jrp lr } +2:	addi r0, r0, -1   /* don't count the trailing NUL */ +	{ sub r0, r0, r3; jrp lr }  	STD_ENDPROC(strncpy_from_user_asm)  	.pushsection .fixup,"ax"  strncpy_from_user_fault:  	{ movei r0, -EFAULT; jrp lr }  	ENDPROC(strncpy_from_user_fault)  	.section __ex_table,"a" +	.align 4  	.word 1b, strncpy_from_user_fault  	.popsection @@ -153,6 +80,7 @@ STD_ENTRY(clear_user_asm)  	bnzt r1, 1b  2:      { move r0, r1; jrp lr }  	.pushsection __ex_table,"a" +	.align 4  	.word 1b, 2b  	.popsection @@ -162,6 +90,7 @@ STD_ENTRY(clear_user_asm)  2:      { move r0, r1; jrp lr }  	STD_ENDPROC(clear_user_asm)  	.pushsection __ex_table,"a" +	.align 4  	.word 1b, 2b  	.popsection @@ -181,25 +110,7 @@ STD_ENTRY(flush_user_asm)  2:      { move r0, r1; jrp lr }  	STD_ENDPROC(flush_user_asm)  	.pushsection __ex_table,"a" -	.word 1b, 2b -	.popsection - -/* - * inv_user_asm takes the user target address in r0 and the - * number of bytes to invalidate in r1. - * It returns the number of not inv'able bytes (hopefully zero) in r0. - */ -STD_ENTRY(inv_user_asm) -	bz r1, 2f -	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 } -	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } -	{ and r0, r0, r2; and r1, r1, r2 } -	{ sub r1, r1, r0 } -1:      { inv r0; addi r1, r1, -CHIP_INV_STRIDE() } -	{ addi r0, r0, CHIP_INV_STRIDE(); bnzt r1, 1b } -2:      { move r0, r1; jrp lr } -	STD_ENDPROC(inv_user_asm) -	.pushsection __ex_table,"a" +	.align 4  	.word 1b, 2b  	.popsection @@ -219,5 +130,6 @@ STD_ENTRY(finv_user_asm)  2:      { move r0, r1; jrp lr }  	STD_ENDPROC(finv_user_asm)  	.pushsection __ex_table,"a" +	.align 4  	.word 1b, 2b  	.popsection diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S new file mode 100644 index 00000000000..b3b31a3306f --- /dev/null +++ b/arch/tile/lib/usercopy_64.S @@ -0,0 +1,135 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + */ + +#include <linux/linkage.h> +#include <asm/errno.h> +#include <asm/cache.h> +#include <arch/chip.h> + +/* Access user memory, but use MMU to avoid propagating kernel exceptions. */ + +/* + * strnlen_user_asm takes the pointer in r0, and the length bound in r1. + * It returns the length, including the terminating NUL, or zero on exception. + * If length is greater than the bound, returns one plus the bound. 
+ */ +STD_ENTRY(strnlen_user_asm) +	{ beqz r1, 2f; addi r3, r0, -1 }  /* bias down to include NUL */ +1:      { ld1u r4, r0; addi r1, r1, -1 } +	beqz r4, 2f +	{ bnezt r1, 1b; addi r0, r0, 1 } +2:      { sub r0, r0, r3; jrp lr } +	STD_ENDPROC(strnlen_user_asm) +	.pushsection .fixup,"ax" +strnlen_user_fault: +	{ move r0, zero; jrp lr } +	ENDPROC(strnlen_user_fault) +	.section __ex_table,"a" +	.align 8 +	.quad 1b, strnlen_user_fault +	.popsection + +/* + * strncpy_from_user_asm takes the kernel target pointer in r0, + * the userspace source pointer in r1, and the length bound (including + * the trailing NUL) in r2.  On success, it returns the string length + * (not including the trailing NUL), or -EFAULT on failure. + */ +STD_ENTRY(strncpy_from_user_asm) +	{ beqz r2, 2f; move r3, r0 } +1:	{ ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 } +	{ st1 r0, r4; addi r0, r0, 1 } +	beqz r4, 2f +	bnezt r2, 1b +	{ sub r0, r0, r3; jrp lr } +2:	addi r0, r0, -1   /* don't count the trailing NUL */ +	{ sub r0, r0, r3; jrp lr } +	STD_ENDPROC(strncpy_from_user_asm) +	.pushsection .fixup,"ax" +strncpy_from_user_fault: +	{ movei r0, -EFAULT; jrp lr } +	ENDPROC(strncpy_from_user_fault) +	.section __ex_table,"a" +	.align 8 +	.quad 1b, strncpy_from_user_fault +	.popsection + +/* + * clear_user_asm takes the user target address in r0 and the + * number of bytes to zero in r1. + * It returns the number of uncopiable bytes (hopefully zero) in r0. + * Note that we don't use a separate .fixup section here since we fall + * through into the "fixup" code as the last straight-line bundle anyway. + */ +STD_ENTRY(clear_user_asm) +	{ beqz r1, 2f; or r2, r0, r1 } +	andi r2, r2, 7 +	beqzt r2, .Lclear_aligned_user_asm +1:      { st1 r0, zero; addi r0, r0, 1; addi r1, r1, -1 } +	bnezt r1, 1b +2:      { move r0, r1; jrp lr } +	.pushsection __ex_table,"a" +	.align 8 +	.quad 1b, 2b +	.popsection + +.Lclear_aligned_user_asm: +1:      { st r0, zero; addi r0, r0, 8; addi r1, r1, -8 } +	bnezt r1, 1b +2:      { move r0, r1; jrp lr } +	STD_ENDPROC(clear_user_asm) +	.pushsection __ex_table,"a" +	.align 8 +	.quad 1b, 2b +	.popsection + +/* + * flush_user_asm takes the user target address in r0 and the + * number of bytes to flush in r1. + * It returns the number of unflushable bytes (hopefully zero) in r0. + */ +STD_ENTRY(flush_user_asm) +	beqz r1, 2f +	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 } +	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } +	{ and r0, r0, r2; and r1, r1, r2 } +	{ sub r1, r1, r0 } +1:      { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() } +	{ addi r0, r0, CHIP_FLUSH_STRIDE(); bnezt r1, 1b } +2:      { move r0, r1; jrp lr } +	STD_ENDPROC(flush_user_asm) +	.pushsection __ex_table,"a" +	.align 8 +	.quad 1b, 2b +	.popsection + +/* + * finv_user_asm takes the user target address in r0 and the + * number of bytes to flush-invalidate in r1. + * It returns the number of not finv'able bytes (hopefully zero) in r0. + */ +STD_ENTRY(finv_user_asm) +	beqz r1, 2f +	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 } +	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } +	{ and r0, r0, r2; and r1, r1, r2 } +	{ sub r1, r1, r0 } +1:      { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() } +	{ addi r0, r0, CHIP_FINV_STRIDE(); bnezt r1, 1b } +2:      { move r0, r1; jrp lr } +	STD_ENDPROC(finv_user_asm) +	.pushsection __ex_table,"a" +	.align 8 +	.quad 1b, 2b +	.popsection  | 
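
flush_user_asm and finv_user_asm above both start by rounding the requested range out to whole L2 cache lines before walking it. A short sketch of that address arithmetic, with an assumed 64-byte line standing in for CHIP_L2_LINE_SIZE()/CHIP_FLUSH_STRIDE() and a printf in place of the flush/finv instructions:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define L2_LINE 64UL		/* assumed line size and stride */

void flush_range_sketch(uintptr_t addr, size_t len)
{
	if (len == 0)
		return;

	uintptr_t start = addr & -L2_LINE;			  /* round down */
	uintptr_t end   = (addr + len + L2_LINE - 1) & -L2_LINE;  /* round up  */

	for (uintptr_t line = start; line < end; line += L2_LINE)
		printf("flush line at %#lx\n", (unsigned long)line);
}

int main(void)
{
	flush_range_sketch(0x10007, 100);
	return 0;
}

With addr = 0x10007 and len = 100 the loop touches exactly the two lines at 0x10000 and 0x10040, mirroring how the assembly masks r0 and r1 with the negated line size before its flush/finv loop.
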
