diff options
Diffstat (limited to 'arch/tile/lib')
-rw-r--r-- | arch/tile/lib/Makefile | 5 | ||||
-rw-r--r-- | arch/tile/lib/atomic_32.c | 17 | ||||
-rw-r--r-- | arch/tile/lib/atomic_asm_32.S | 2 | ||||
-rw-r--r-- | arch/tile/lib/cacheflush.c | 102 | ||||
-rw-r--r-- | arch/tile/lib/delay.c | 21 | ||||
-rw-r--r-- | arch/tile/lib/exports.c | 10 | ||||
-rw-r--r-- | arch/tile/lib/mb_incoherent.S | 34 | ||||
-rw-r--r-- | arch/tile/lib/memcpy_tile64.c | 4 | ||||
-rw-r--r-- | arch/tile/lib/spinlock_32.c | 161 |
9 files changed, 234 insertions, 122 deletions
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile index 93122d5b155..0c26086ecbe 100644 --- a/arch/tile/lib/Makefile +++ b/arch/tile/lib/Makefile @@ -2,9 +2,8 @@ # Makefile for TILE-specific library files.. # -lib-y = cacheflush.o checksum.o cpumask.o delay.o \ - mb_incoherent.o uaccess.o memmove.o \ - memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \ +lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \ + memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \ strchr_$(BITS).o strlen_$(BITS).o ifeq ($(CONFIG_TILEGX),y) diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c index 7a5cc706ab6..46570211df5 100644 --- a/arch/tile/lib/atomic_32.c +++ b/arch/tile/lib/atomic_32.c @@ -46,14 +46,13 @@ struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE] #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ /* This page is remapped on startup to be hash-for-home. */ -int atomic_locks[PAGE_SIZE / sizeof(int) /* Only ATOMIC_HASH_SIZE is used */] - __attribute__((aligned(PAGE_SIZE), section(".bss.page_aligned"))); +int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss; #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ static inline int *__atomic_hashed_lock(volatile void *v) { - /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec.S */ + /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */ #if ATOMIC_LOCKS_FOUND_VIA_TABLE() unsigned long i = (unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long)); @@ -203,32 +202,32 @@ static inline int *__futex_setup(int __user *v) return __atomic_hashed_lock((int __force *)v); } -struct __get_user futex_set(int __user *v, int i) +struct __get_user futex_set(u32 __user *v, int i) { return __atomic_xchg((int __force *)v, __futex_setup(v), i); } -struct __get_user futex_add(int __user *v, int n) +struct __get_user futex_add(u32 __user *v, int n) { return __atomic_xchg_add((int __force *)v, __futex_setup(v), n); } -struct __get_user futex_or(int __user *v, int n) +struct __get_user futex_or(u32 __user *v, int n) { return __atomic_or((int __force *)v, __futex_setup(v), n); } -struct __get_user futex_andn(int __user *v, int n) +struct __get_user futex_andn(u32 __user *v, int n) { return __atomic_andn((int __force *)v, __futex_setup(v), n); } -struct __get_user futex_xor(int __user *v, int n) +struct __get_user futex_xor(u32 __user *v, int n) { return __atomic_xor((int __force *)v, __futex_setup(v), n); } -struct __get_user futex_cmpxchg(int __user *v, int o, int n) +struct __get_user futex_cmpxchg(u32 __user *v, int o, int n) { return __atomic_cmpxchg((int __force *)v, __futex_setup(v), o, n); } diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S index 5a5514b77e7..82f64cc6365 100644 --- a/arch/tile/lib/atomic_asm_32.S +++ b/arch/tile/lib/atomic_asm_32.S @@ -14,7 +14,7 @@ * Support routines for atomic operations. Each function takes: * * r0: address to manipulate - * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG) + * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG) * r2: new value to write, or for cmpxchg/add_unless, value to compare against * r3: (cmpxchg/xchg_add_unless) new value to write or add; * (atomic64 ops) high word of value to write diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c index 11b6164c209..35c1d8ca5f3 100644 --- a/arch/tile/lib/cacheflush.c +++ b/arch/tile/lib/cacheflush.c @@ -21,3 +21,105 @@ void __flush_icache_range(unsigned long start, unsigned long end) { invalidate_icache((const void *)start, end - start, PAGE_SIZE); } + + +/* Force a load instruction to issue. */ +static inline void force_load(char *p) +{ + *(volatile char *)p; +} + +/* + * Flush and invalidate a VA range that is homed remotely on a single + * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting + * until the memory controller holds the flushed values. + */ +void finv_buffer_remote(void *buffer, size_t size, int hfh) +{ + char *p, *base; + size_t step_size, load_count; + const unsigned long STRIPE_WIDTH = 8192; + + /* + * Flush and invalidate the buffer out of the local L1/L2 + * and request the home cache to flush and invalidate as well. + */ + __finv_buffer(buffer, size); + + /* + * Wait for the home cache to acknowledge that it has processed + * all the flush-and-invalidate requests. This does not mean + * that the flushed data has reached the memory controller yet, + * but it does mean the home cache is processing the flushes. + */ + __insn_mf(); + + /* + * Issue a load to the last cache line, which can't complete + * until all the previously-issued flushes to the same memory + * controller have also completed. If we weren't striping + * memory, that one load would be sufficient, but since we may + * be, we also need to back up to the last load issued to + * another memory controller, which would be the point where + * we crossed an 8KB boundary (the granularity of striping + * across memory controllers). Keep backing up and doing this + * until we are before the beginning of the buffer, or have + * hit all the controllers. + * + * If we are flushing a hash-for-home buffer, it's even worse. + * Each line may be homed on a different tile, and each tile + * may have up to four lines that are on different + * controllers. So as we walk backwards, we have to touch + * enough cache lines to satisfy these constraints. In + * practice this ends up being close enough to "load from + * every cache line on a full memory stripe on each + * controller" that we simply do that, to simplify the logic. + * + * FIXME: See bug 9535 for some issues with this code. + */ + if (hfh) { + step_size = L2_CACHE_BYTES; + load_count = (STRIPE_WIDTH / L2_CACHE_BYTES) * + (1 << CHIP_LOG_NUM_MSHIMS()); + } else { + step_size = STRIPE_WIDTH; + load_count = (1 << CHIP_LOG_NUM_MSHIMS()); + } + + /* Load the last byte of the buffer. */ + p = (char *)buffer + size - 1; + force_load(p); + + /* Bump down to the end of the previous stripe or cache line. */ + p -= step_size; + p = (char *)((unsigned long)p | (step_size - 1)); + + /* Figure out how far back we need to go. */ + base = p - (step_size * (load_count - 2)); + if ((long)base < (long)buffer) + base = buffer; + + /* + * Fire all the loads we need. The MAF only has eight entries + * so we can have at most eight outstanding loads, so we + * unroll by that amount. + */ +#pragma unroll 8 + for (; p >= base; p -= step_size) + force_load(p); + + /* + * Repeat, but with inv's instead of loads, to get rid of the + * data we just loaded into our own cache and the old home L3. + * No need to unroll since inv's don't target a register. + */ + p = (char *)buffer + size - 1; + __insn_inv(p); + p -= step_size; + p = (char *)((unsigned long)p | (step_size - 1)); + for (; p >= base; p -= step_size) + __insn_inv(p); + + /* Wait for the load+inv's (and thus finvs) to have completed. */ + __insn_mf(); +} diff --git a/arch/tile/lib/delay.c b/arch/tile/lib/delay.c index 5801b03c13e..cdacdd11d36 100644 --- a/arch/tile/lib/delay.c +++ b/arch/tile/lib/delay.c @@ -15,20 +15,31 @@ #include <linux/module.h> #include <linux/delay.h> #include <linux/thread_info.h> -#include <asm/fixmap.h> -#include <hv/hypervisor.h> +#include <asm/timex.h> void __udelay(unsigned long usecs) { - hv_nanosleep(usecs * 1000); + if (usecs > ULONG_MAX / 1000) { + WARN_ON_ONCE(usecs > ULONG_MAX / 1000); + usecs = ULONG_MAX / 1000; + } + __ndelay(usecs * 1000); } EXPORT_SYMBOL(__udelay); void __ndelay(unsigned long nsecs) { - hv_nanosleep(nsecs); + cycles_t target = get_cycles(); + target += ns2cycles(nsecs); + while (get_cycles() < target) + cpu_relax(); } EXPORT_SYMBOL(__ndelay); -/* FIXME: should be declared in a header somewhere. */ +void __delay(unsigned long cycles) +{ + cycles_t target = get_cycles() + cycles; + while (get_cycles() < target) + cpu_relax(); +} EXPORT_SYMBOL(__delay); diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c index 1509c559765..49284fae9d0 100644 --- a/arch/tile/lib/exports.c +++ b/arch/tile/lib/exports.c @@ -29,6 +29,9 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strnlen_user_asm); EXPORT_SYMBOL(strncpy_from_user_asm); EXPORT_SYMBOL(clear_user_asm); +EXPORT_SYMBOL(flush_user_asm); +EXPORT_SYMBOL(inv_user_asm); +EXPORT_SYMBOL(finv_user_asm); /* arch/tile/kernel/entry.S */ #include <linux/kernel.h> @@ -45,9 +48,6 @@ EXPORT_SYMBOL(__copy_from_user_zeroing); EXPORT_SYMBOL(__copy_in_user_inatomic); #endif -/* arch/tile/lib/mb_incoherent.S */ -EXPORT_SYMBOL(__mb_incoherent); - /* hypervisor glue */ #include <hv/hypervisor.h> EXPORT_SYMBOL(hv_dev_open); @@ -85,4 +85,8 @@ int64_t __muldi3(int64_t, int64_t); EXPORT_SYMBOL(__muldi3); uint64_t __lshrdi3(uint64_t, unsigned int); EXPORT_SYMBOL(__lshrdi3); +uint64_t __ashrdi3(uint64_t, unsigned int); +EXPORT_SYMBOL(__ashrdi3); +uint64_t __ashldi3(uint64_t, unsigned int); +EXPORT_SYMBOL(__ashldi3); #endif diff --git a/arch/tile/lib/mb_incoherent.S b/arch/tile/lib/mb_incoherent.S deleted file mode 100644 index 989ad7b68d5..00000000000 --- a/arch/tile/lib/mb_incoherent.S +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - * - * Assembly code for invoking the HV's fence_incoherent syscall. - */ - -#include <linux/linkage.h> -#include <hv/syscall_public.h> -#include <arch/abi.h> -#include <arch/chip.h> - -#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS() - -/* - * Invoke the hypervisor's fence_incoherent syscall, which guarantees - * that all victims for cachelines homed on this tile have reached memory. - */ -STD_ENTRY(__mb_incoherent) - moveli TREG_SYSCALL_NR_NAME, HV_SYS_fence_incoherent - swint2 - jrp lr - STD_ENDPROC(__mb_incoherent) - -#endif diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c index f7d4a6ad61e..b2fe15e0107 100644 --- a/arch/tile/lib/memcpy_tile64.c +++ b/arch/tile/lib/memcpy_tile64.c @@ -96,7 +96,7 @@ static void memcpy_multicache(void *dest, const void *source, newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1)); pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc); ptep = pte_offset_kernel(pmdp, newsrc); - *ptep = src_pte; /* set_pte() would be confused by this */ + __set_pte(ptep, src_pte); /* set_pte() would be confused by this */ local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); /* Actually move the data. */ @@ -109,7 +109,7 @@ static void memcpy_multicache(void *dest, const void *source, */ src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3); src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */ - *ptep = src_pte; /* set_pte() would be confused by this */ + __set_pte(ptep, src_pte); /* set_pte() would be confused by this */ local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); /* diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c index 5cd1c4004ec..cb0999fb64b 100644 --- a/arch/tile/lib/spinlock_32.c +++ b/arch/tile/lib/spinlock_32.c @@ -15,6 +15,7 @@ #include <linux/spinlock.h> #include <linux/module.h> #include <asm/processor.h> +#include <arch/spr_def.h> #include "spinlock_common.h" @@ -91,75 +92,75 @@ EXPORT_SYMBOL(arch_spin_unlock_wait); #define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1) -/* Lock the word, spinning until there are no tns-ers. */ -static inline u32 get_rwlock(arch_rwlock_t *rwlock) -{ - u32 iterations = 0; - for (;;) { - u32 val = __insn_tns((int *)&rwlock->lock); - if (unlikely(val & 1)) { - delay_backoff(iterations++); - continue; - } - return val; - } -} - -int arch_read_trylock_slow(arch_rwlock_t *rwlock) -{ - u32 val = get_rwlock(rwlock); - int locked = (val << RD_COUNT_WIDTH) == 0; - rwlock->lock = val + (locked << RD_COUNT_SHIFT); - return locked; -} -EXPORT_SYMBOL(arch_read_trylock_slow); - -void arch_read_unlock_slow(arch_rwlock_t *rwlock) -{ - u32 val = get_rwlock(rwlock); - rwlock->lock = val - (1 << RD_COUNT_SHIFT); -} -EXPORT_SYMBOL(arch_read_unlock_slow); - -void arch_write_unlock_slow(arch_rwlock_t *rwlock, u32 val) +/* + * We can get the read lock if everything but the reader bits (which + * are in the high part of the word) is zero, i.e. no active or + * waiting writers, no tns. + * + * We guard the tns/store-back with an interrupt critical section to + * preserve the semantic that the same read lock can be acquired in an + * interrupt context. + */ +inline int arch_read_trylock(arch_rwlock_t *rwlock) { - u32 eq, mask = 1 << WR_CURR_SHIFT; - while (unlikely(val & 1)) { - /* Limited backoff since we are the highest-priority task. */ - relax(4); - val = __insn_tns((int *)&rwlock->lock); + u32 val; + __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); + val = __insn_tns((int *)&rwlock->lock); + if (likely((val << _RD_COUNT_WIDTH) == 0)) { + val += 1 << RD_COUNT_SHIFT; + rwlock->lock = val; + __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); + BUG_ON(val == 0); /* we don't expect wraparound */ + return 1; } - val = __insn_addb(val, mask); - eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT)); - val = __insn_mz(eq & mask, val); - rwlock->lock = val; + if ((val & 1) == 0) + rwlock->lock = val; + __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); + return 0; } -EXPORT_SYMBOL(arch_write_unlock_slow); +EXPORT_SYMBOL(arch_read_trylock); /* - * We spin until everything but the reader bits (which are in the high - * part of the word) are zero, i.e. no active or waiting writers, no tns. - * + * Spin doing arch_read_trylock() until we acquire the lock. * ISSUE: This approach can permanently starve readers. A reader who sees * a writer could instead take a ticket lock (just like a writer would), * and atomically enter read mode (with 1 reader) when it gets the ticket. - * This way both readers and writers will always make forward progress + * This way both readers and writers would always make forward progress * in a finite time. */ -void arch_read_lock_slow(arch_rwlock_t *rwlock, u32 val) +void arch_read_lock(arch_rwlock_t *rwlock) { u32 iterations = 0; - do { - if (!(val & 1)) - rwlock->lock = val; + while (unlikely(!arch_read_trylock(rwlock))) delay_backoff(iterations++); +} +EXPORT_SYMBOL(arch_read_lock); + +void arch_read_unlock(arch_rwlock_t *rwlock) +{ + u32 val, iterations = 0; + + mb(); /* guarantee anything modified under the lock is visible */ + for (;;) { + __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); val = __insn_tns((int *)&rwlock->lock); - } while ((val << RD_COUNT_WIDTH) != 0); - rwlock->lock = val + (1 << RD_COUNT_SHIFT); + if (likely(val & 1) == 0) { + rwlock->lock = val - (1 << _RD_COUNT_SHIFT); + __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); + break; + } + __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); + delay_backoff(iterations++); + } } -EXPORT_SYMBOL(arch_read_lock_slow); +EXPORT_SYMBOL(arch_read_unlock); -void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val) +/* + * We don't need an interrupt critical section here (unlike for + * arch_read_lock) since we should never use a bare write lock where + * it could be interrupted by code that could try to re-acquire it. + */ +void arch_write_lock(arch_rwlock_t *rwlock) { /* * The trailing underscore on this variable (and curr_ below) @@ -168,6 +169,12 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val) */ u32 my_ticket_; u32 iterations = 0; + u32 val = __insn_tns((int *)&rwlock->lock); + + if (likely(val == 0)) { + rwlock->lock = 1 << _WR_NEXT_SHIFT; + return; + } /* * Wait until there are no readers, then bump up the next @@ -206,23 +213,47 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val) relax(4); } } -EXPORT_SYMBOL(arch_write_lock_slow); +EXPORT_SYMBOL(arch_write_lock); -int __tns_atomic_acquire(atomic_t *lock) +int arch_write_trylock(arch_rwlock_t *rwlock) { - int ret; - u32 iterations = 0; + u32 val = __insn_tns((int *)&rwlock->lock); - BUG_ON(__insn_mfspr(SPR_INTERRUPT_CRITICAL_SECTION)); - __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); + /* + * If a tns is in progress, or there's a waiting or active locker, + * or active readers, we can't take the lock, so give up. + */ + if (unlikely(val != 0)) { + if (!(val & 1)) + rwlock->lock = val; + return 0; + } - while ((ret = __insn_tns((void *)&lock->counter)) == 1) - delay_backoff(iterations++); - return ret; + /* Set the "next" field to mark it locked. */ + rwlock->lock = 1 << _WR_NEXT_SHIFT; + return 1; } +EXPORT_SYMBOL(arch_write_trylock); -void __tns_atomic_release(atomic_t *p, int v) +void arch_write_unlock(arch_rwlock_t *rwlock) { - p->counter = v; - __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); + u32 val, eq, mask; + + mb(); /* guarantee anything modified under the lock is visible */ + val = __insn_tns((int *)&rwlock->lock); + if (likely(val == (1 << _WR_NEXT_SHIFT))) { + rwlock->lock = 0; + return; + } + while (unlikely(val & 1)) { + /* Limited backoff since we are the highest-priority task. */ + relax(4); + val = __insn_tns((int *)&rwlock->lock); + } + mask = 1 << WR_CURR_SHIFT; + val = __insn_addb(val, mask); + eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT)); + val = __insn_mz(eq & mask, val); + rwlock->lock = val; } +EXPORT_SYMBOL(arch_write_unlock); |