diff options
Diffstat (limited to 'arch/tile/kernel')
59 files changed, 10164 insertions, 2213 deletions
diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile index b4dbc057baa..21f77bf68c6 100644 --- a/arch/tile/kernel/Makefile +++ b/arch/tile/kernel/Makefile @@ -3,16 +3,34 @@ # extra-y := vmlinux.lds head_$(BITS).o -obj-y := backtrace.o entry.o init_task.o irq.o messaging.o \ +obj-y := backtrace.o entry.o hvglue.o irq.o messaging.o \ pci-dma.o proc.o process.o ptrace.o reboot.o \ - setup.o signal.o single_step.o stack.o sys.o sysfs.o time.o traps.o \ + setup.o signal.o single_step.o stack.o sys.o \ + sysfs.o time.o traps.o unaligned.o vdso.o \ intvec_$(BITS).o regs_$(BITS).o tile-desc_$(BITS).o +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_ftrace.o = -pg +CFLAGS_REMOVE_early_printk.o = -pg +endif + obj-$(CONFIG_HARDWALL) += hardwall.o -obj-$(CONFIG_TILEGX) += futex_64.o obj-$(CONFIG_COMPAT) += compat.o compat_signal.o obj-$(CONFIG_SMP) += smpboot.o smp.o tlb.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel_$(BITS).o +ifdef CONFIG_TILEGX +obj-$(CONFIG_PCI) += pci_gx.o +else obj-$(CONFIG_PCI) += pci.o +endif +obj-$(CONFIG_PERF_EVENTS) += perf_event.o +obj-$(CONFIG_USE_PMC) += pmc.o +obj-$(CONFIG_TILE_USB) += usb.o +obj-$(CONFIG_TILE_HVGLUE_TRACE) += hvglue_trace.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o mcount_64.o +obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_KGDB) += kgdb.o + +obj-y += vdso/ diff --git a/arch/tile/kernel/asm-offsets.c b/arch/tile/kernel/asm-offsets.c index 01ddf19cc36..375e7c321ee 100644 --- a/arch/tile/kernel/asm-offsets.c +++ b/arch/tile/kernel/asm-offsets.c @@ -14,13 +14,6 @@ * Generates definitions from c-type structures used by assembly sources. */ -#include <linux/kbuild.h> -#include <linux/thread_info.h> -#include <linux/sched.h> -#include <linux/hardirq.h> -#include <linux/ptrace.h> -#include <hv/hypervisor.h> - /* Check for compatible compiler early in the build. */ #ifdef CONFIG_TILEGX # ifndef __tilegx__ @@ -31,46 +24,61 @@ # endif #else # ifdef __tilegx__ -# error Can not build TILEPro/TILE64 configurations with tilegx compiler +# error Can not build TILEPro configurations with tilegx compiler # endif #endif +#include <linux/kbuild.h> +#include <linux/thread_info.h> +#include <linux/sched.h> +#include <linux/hardirq.h> +#include <linux/ptrace.h> +#include <hv/hypervisor.h> + void foo(void) { - DEFINE(SINGLESTEP_STATE_BUFFER_OFFSET, \ + DEFINE(SINGLESTEP_STATE_BUFFER_OFFSET, offsetof(struct single_step_state, buffer)); - DEFINE(SINGLESTEP_STATE_FLAGS_OFFSET, \ + DEFINE(SINGLESTEP_STATE_FLAGS_OFFSET, offsetof(struct single_step_state, flags)); - DEFINE(SINGLESTEP_STATE_ORIG_PC_OFFSET, \ + DEFINE(SINGLESTEP_STATE_ORIG_PC_OFFSET, offsetof(struct single_step_state, orig_pc)); - DEFINE(SINGLESTEP_STATE_NEXT_PC_OFFSET, \ + DEFINE(SINGLESTEP_STATE_NEXT_PC_OFFSET, offsetof(struct single_step_state, next_pc)); - DEFINE(SINGLESTEP_STATE_BRANCH_NEXT_PC_OFFSET, \ + DEFINE(SINGLESTEP_STATE_BRANCH_NEXT_PC_OFFSET, offsetof(struct single_step_state, branch_next_pc)); - DEFINE(SINGLESTEP_STATE_UPDATE_VALUE_OFFSET, \ + DEFINE(SINGLESTEP_STATE_UPDATE_VALUE_OFFSET, offsetof(struct single_step_state, update_value)); - DEFINE(THREAD_INFO_TASK_OFFSET, \ + DEFINE(THREAD_INFO_TASK_OFFSET, offsetof(struct thread_info, task)); - DEFINE(THREAD_INFO_FLAGS_OFFSET, \ + DEFINE(THREAD_INFO_FLAGS_OFFSET, offsetof(struct thread_info, flags)); - DEFINE(THREAD_INFO_STATUS_OFFSET, \ + DEFINE(THREAD_INFO_STATUS_OFFSET, offsetof(struct thread_info, status)); - DEFINE(THREAD_INFO_HOMECACHE_CPU_OFFSET, \ + DEFINE(THREAD_INFO_HOMECACHE_CPU_OFFSET, offsetof(struct thread_info, homecache_cpu)); - DEFINE(THREAD_INFO_STEP_STATE_OFFSET, \ + DEFINE(THREAD_INFO_PREEMPT_COUNT_OFFSET, + offsetof(struct thread_info, preempt_count)); + DEFINE(THREAD_INFO_STEP_STATE_OFFSET, offsetof(struct thread_info, step_state)); +#ifdef __tilegx__ + DEFINE(THREAD_INFO_UNALIGN_JIT_BASE_OFFSET, + offsetof(struct thread_info, unalign_jit_base)); + DEFINE(THREAD_INFO_UNALIGN_JIT_TMP_OFFSET, + offsetof(struct thread_info, unalign_jit_tmp)); +#endif DEFINE(TASK_STRUCT_THREAD_KSP_OFFSET, offsetof(struct task_struct, thread.ksp)); DEFINE(TASK_STRUCT_THREAD_PC_OFFSET, offsetof(struct task_struct, thread.pc)); - DEFINE(HV_TOPOLOGY_WIDTH_OFFSET, \ + DEFINE(HV_TOPOLOGY_WIDTH_OFFSET, offsetof(HV_Topology, width)); - DEFINE(HV_TOPOLOGY_HEIGHT_OFFSET, \ + DEFINE(HV_TOPOLOGY_HEIGHT_OFFSET, offsetof(HV_Topology, height)); - DEFINE(IRQ_CPUSTAT_SYSCALL_COUNT_OFFSET, \ + DEFINE(IRQ_CPUSTAT_SYSCALL_COUNT_OFFSET, offsetof(irq_cpustat_t, irq_syscall_count)); } diff --git a/arch/tile/kernel/backtrace.c b/arch/tile/kernel/backtrace.c index 9092ce8aa6b..f8b74ca83b9 100644 --- a/arch/tile/kernel/backtrace.c +++ b/arch/tile/kernel/backtrace.c @@ -14,6 +14,7 @@ #include <linux/kernel.h> #include <linux/string.h> +#include <asm/byteorder.h> #include <asm/backtrace.h> #include <asm/tile-desc.h> #include <arch/abi.h> @@ -336,8 +337,12 @@ static void find_caller_pc_and_caller_sp(CallerLocation *location, bytes_to_prefetch / sizeof(tile_bundle_bits); } - /* Decode the next bundle. */ - bundle.bits = prefetched_bundles[next_bundle++]; + /* + * Decode the next bundle. + * TILE always stores instruction bundles in little-endian + * mode, even when the chip is running in big-endian mode. + */ + bundle.bits = le64_to_cpu(prefetched_bundles[next_bundle++]); bundle.num_insns = parse_insn_tile(bundle.bits, pc, bundle.insns); num_info_ops = bt_get_info_ops(&bundle, info_operands); diff --git a/arch/tile/kernel/compat.c b/arch/tile/kernel/compat.c index bf5e9d70266..49120843ff9 100644 --- a/arch/tile/kernel/compat.c +++ b/arch/tile/kernel/compat.c @@ -16,7 +16,6 @@ #define __SYSCALL_COMPAT #include <linux/compat.h> -#include <linux/msg.h> #include <linux/syscalls.h> #include <linux/kdev_t.h> #include <linux/fs.h> @@ -33,121 +32,69 @@ * adapt the usual convention. */ -long compat_sys_truncate64(char __user *filename, u32 dummy, u32 low, u32 high) +COMPAT_SYSCALL_DEFINE4(truncate64, char __user *, filename, u32, dummy, + u32, low, u32, high) { return sys_truncate(filename, ((loff_t)high << 32) | low); } -long compat_sys_ftruncate64(unsigned int fd, u32 dummy, u32 low, u32 high) +COMPAT_SYSCALL_DEFINE4(ftruncate64, unsigned int, fd, u32, dummy, + u32, low, u32, high) { return sys_ftruncate(fd, ((loff_t)high << 32) | low); } -long compat_sys_pread64(unsigned int fd, char __user *ubuf, size_t count, - u32 dummy, u32 low, u32 high) +COMPAT_SYSCALL_DEFINE6(pread64, unsigned int, fd, char __user *, ubuf, + size_t, count, u32, dummy, u32, low, u32, high) { return sys_pread64(fd, ubuf, count, ((loff_t)high << 32) | low); } -long compat_sys_pwrite64(unsigned int fd, char __user *ubuf, size_t count, - u32 dummy, u32 low, u32 high) +COMPAT_SYSCALL_DEFINE6(pwrite64, unsigned int, fd, char __user *, ubuf, + size_t, count, u32, dummy, u32, low, u32, high) { return sys_pwrite64(fd, ubuf, count, ((loff_t)high << 32) | low); } -long compat_sys_lookup_dcookie(u32 low, u32 high, char __user *buf, size_t len) -{ - return sys_lookup_dcookie(((loff_t)high << 32) | low, buf, len); -} - -long compat_sys_sync_file_range2(int fd, unsigned int flags, - u32 offset_lo, u32 offset_hi, - u32 nbytes_lo, u32 nbytes_hi) +COMPAT_SYSCALL_DEFINE6(sync_file_range2, int, fd, unsigned int, flags, + u32, offset_lo, u32, offset_hi, + u32, nbytes_lo, u32, nbytes_hi) { return sys_sync_file_range(fd, ((loff_t)offset_hi << 32) | offset_lo, ((loff_t)nbytes_hi << 32) | nbytes_lo, flags); } -long compat_sys_fallocate(int fd, int mode, - u32 offset_lo, u32 offset_hi, - u32 len_lo, u32 len_hi) +COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, + u32, offset_lo, u32, offset_hi, + u32, len_lo, u32, len_hi) { return sys_fallocate(fd, mode, ((loff_t)offset_hi << 32) | offset_lo, ((loff_t)len_hi << 32) | len_lo); } - - -long compat_sys_sched_rr_get_interval(compat_pid_t pid, - struct compat_timespec __user *interval) -{ - struct timespec t; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_sched_rr_get_interval(pid, - (struct timespec __force __user *)&t); - set_fs(old_fs); - if (put_compat_timespec(&t, interval)) - return -EFAULT; - return ret; -} - /* - * The usual compat_sys_msgsnd() and _msgrcv() seem to be assuming - * some different calling convention than our normal 32-bit tile code. + * Avoid bug in generic sys_llseek() that specifies offset_high and + * offset_low as "unsigned long", thus making it possible to pass + * a sign-extended high 32 bits in offset_low. */ - -/* Already defined in ipc/compat.c, but we need it here. */ -struct compat_msgbuf { - compat_long_t mtype; - char mtext[1]; -}; - -long tile_compat_sys_msgsnd(int msqid, - struct compat_msgbuf __user *msgp, - size_t msgsz, int msgflg) +COMPAT_SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned int, offset_high, + unsigned int, offset_low, loff_t __user *, result, + unsigned int, origin) { - compat_long_t mtype; - - if (get_user(mtype, &msgp->mtype)) - return -EFAULT; - return do_msgsnd(msqid, mtype, msgp->mtext, msgsz, msgflg); -} - -long tile_compat_sys_msgrcv(int msqid, - struct compat_msgbuf __user *msgp, - size_t msgsz, long msgtyp, int msgflg) -{ - long err, mtype; - - err = do_msgrcv(msqid, &mtype, msgp->mtext, msgsz, msgtyp, msgflg); - if (err < 0) - goto out; - - if (put_user(mtype, &msgp->mtype)) - err = -EFAULT; - out: - return err; + return sys_llseek(fd, offset_high, offset_low, result, origin); } /* Provide the compat syscall number to call mapping. */ #undef __SYSCALL #define __SYSCALL(nr, call) [nr] = (call), -/* The generic versions of these don't work for Tile. */ -#define compat_sys_msgrcv tile_compat_sys_msgrcv -#define compat_sys_msgsnd tile_compat_sys_msgsnd - /* See comments in sys.c */ #define compat_sys_fadvise64_64 sys32_fadvise64_64 #define compat_sys_readahead sys32_readahead +#define sys_llseek compat_sys_llseek -/* Call the trampolines to manage pt_regs where necessary. */ -#define compat_sys_execve _compat_sys_execve -#define compat_sys_sigaltstack _compat_sys_sigaltstack +/* Call the assembly trampolines where necessary. */ #define compat_sys_rt_sigreturn _compat_sys_rt_sigreturn #define sys_clone _sys_clone diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c index a7869ad6277..19c04b5ce40 100644 --- a/arch/tile/kernel/compat_signal.c +++ b/arch/tile/kernel/compat_signal.c @@ -32,21 +32,9 @@ #include <asm/ucontext.h> #include <asm/sigframe.h> #include <asm/syscalls.h> +#include <asm/vdso.h> #include <arch/interrupts.h> -struct compat_sigaction { - compat_uptr_t sa_handler; - compat_ulong_t sa_flags; - compat_uptr_t sa_restorer; - sigset_t sa_mask __packed; -}; - -struct compat_sigaltstack { - compat_uptr_t ss_sp; - int ss_flags; - compat_size_t ss_size; -}; - struct compat_ucontext { compat_ulong_t uc_flags; compat_uptr_t uc_link; @@ -55,129 +43,13 @@ struct compat_ucontext { sigset_t uc_sigmask; /* mask last for extensibility */ }; -#define COMPAT_SI_PAD_SIZE ((SI_MAX_SIZE - 3 * sizeof(int)) / sizeof(int)) - -struct compat_siginfo { - int si_signo; - int si_errno; - int si_code; - - union { - int _pad[COMPAT_SI_PAD_SIZE]; - - /* kill() */ - struct { - unsigned int _pid; /* sender's pid */ - unsigned int _uid; /* sender's uid */ - } _kill; - - /* POSIX.1b timers */ - struct { - compat_timer_t _tid; /* timer id */ - int _overrun; /* overrun count */ - compat_sigval_t _sigval; /* same as below */ - int _sys_private; /* not to be passed to user */ - int _overrun_incr; /* amount to add to overrun */ - } _timer; - - /* POSIX.1b signals */ - struct { - unsigned int _pid; /* sender's pid */ - unsigned int _uid; /* sender's uid */ - compat_sigval_t _sigval; - } _rt; - - /* SIGCHLD */ - struct { - unsigned int _pid; /* which child */ - unsigned int _uid; /* sender's uid */ - int _status; /* exit code */ - compat_clock_t _utime; - compat_clock_t _stime; - } _sigchld; - - /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ - struct { - unsigned int _addr; /* faulting insn/memory ref. */ -#ifdef __ARCH_SI_TRAPNO - int _trapno; /* TRAP # which caused the signal */ -#endif - } _sigfault; - - /* SIGPOLL */ - struct { - int _band; /* POLL_IN, POLL_OUT, POLL_MSG */ - int _fd; - } _sigpoll; - } _sifields; -}; - struct compat_rt_sigframe { unsigned char save_area[C_ABI_SAVE_AREA_SIZE]; /* caller save area */ struct compat_siginfo info; struct compat_ucontext uc; }; -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -long compat_sys_rt_sigaction(int sig, struct compat_sigaction __user *act, - struct compat_sigaction __user *oact, - size_t sigsetsize) -{ - struct k_sigaction new_sa, old_sa; - int ret = -EINVAL; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - goto out; - - if (act) { - compat_uptr_t handler, restorer; - - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || - __get_user(handler, &act->sa_handler) || - __get_user(new_sa.sa.sa_flags, &act->sa_flags) || - __get_user(restorer, &act->sa_restorer) || - __copy_from_user(&new_sa.sa.sa_mask, &act->sa_mask, - sizeof(sigset_t))) - return -EFAULT; - new_sa.sa.sa_handler = compat_ptr(handler); - new_sa.sa.sa_restorer = compat_ptr(restorer); - } - - ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); - - if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(ptr_to_compat(old_sa.sa.sa_handler), - &oact->sa_handler) || - __put_user(ptr_to_compat(old_sa.sa.sa_restorer), - &oact->sa_restorer) || - __put_user(old_sa.sa.sa_flags, &oact->sa_flags) || - __copy_to_user(&oact->sa_mask, &old_sa.sa.sa_mask, - sizeof(sigset_t))) - return -EFAULT; - } -out: - return ret; -} - -long compat_sys_rt_sigqueueinfo(int pid, int sig, - struct compat_siginfo __user *uinfo) -{ - siginfo_t info; - int ret; - mm_segment_t old_fs = get_fs(); - - if (copy_siginfo_from_user32(&info, uinfo)) - return -EFAULT; - set_fs(KERNEL_DS); - ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __force __user *)&info); - set_fs(old_fs); - return ret; -} - -int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from) +int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *from) { int err; @@ -255,44 +127,10 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from) return err; } -long compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr, - struct compat_sigaltstack __user *uoss_ptr, - struct pt_regs *regs) -{ - stack_t uss, uoss; - int ret; - mm_segment_t seg; - - if (uss_ptr) { - u32 ptr; - - memset(&uss, 0, sizeof(stack_t)); - if (!access_ok(VERIFY_READ, uss_ptr, sizeof(*uss_ptr)) || - __get_user(ptr, &uss_ptr->ss_sp) || - __get_user(uss.ss_flags, &uss_ptr->ss_flags) || - __get_user(uss.ss_size, &uss_ptr->ss_size)) - return -EFAULT; - uss.ss_sp = compat_ptr(ptr); - } - seg = get_fs(); - set_fs(KERNEL_DS); - ret = do_sigaltstack(uss_ptr ? (stack_t __user __force *)&uss : NULL, - (stack_t __user __force *)&uoss, - (unsigned long)compat_ptr(regs->sp)); - set_fs(seg); - if (ret >= 0 && uoss_ptr) { - if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(*uoss_ptr)) || - __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || - __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || - __put_user(uoss.ss_size, &uoss_ptr->ss_size)) - ret = -EFAULT; - } - return ret; -} - /* The assembly shim for this function arranges to ignore the return value. */ -long compat_sys_rt_sigreturn(struct pt_regs *regs) +long compat_sys_rt_sigreturn(void) { + struct pt_regs *regs = current_pt_regs(); struct compat_rt_sigframe __user *frame = (struct compat_rt_sigframe __user *) compat_ptr(regs->sp); sigset_t set; @@ -302,16 +140,12 @@ long compat_sys_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; - if (compat_sys_sigaltstack(&frame->uc.uc_stack, NULL, regs) != 0) + if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; return 0; @@ -388,17 +222,13 @@ int compat_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __clear_user(&frame->save_area, sizeof(frame->save_area)); err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(ptr_to_compat((void *)(current->sas_ss_sp)), - &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) goto give_sigsegv; - restorer = VDSO_BASE; + restorer = VDSO_SYM(&__vdso_rt_sigreturn); if (ka->sa.sa_flags & SA_RESTORER) restorer = ptr_to_compat_reg(ka->sa.sa_restorer); @@ -406,28 +236,17 @@ int compat_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, * Set up registers for signal handler. * Registers that we don't modify keep the value they had from * user-space at the time we took the signal. + * We always pass siginfo and mcontext, regardless of SA_SIGINFO, + * since some things rely on this (e.g. glibc's debug/segfault.c). */ regs->pc = ptr_to_compat_reg(ka->sa.sa_handler); regs->ex1 = PL_ICS_EX1(USER_PL, 1); /* set crit sec in handler */ regs->sp = ptr_to_compat_reg(frame); regs->lr = restorer; regs->regs[0] = (unsigned long) usig; - - if (ka->sa.sa_flags & SA_SIGINFO) { - /* Need extra arguments, so mark to restore caller-saves. */ - regs->regs[1] = ptr_to_compat_reg(&frame->info); - regs->regs[2] = ptr_to_compat_reg(&frame->uc); - regs->flags |= PT_FLAGS_CALLER_SAVES; - } - - /* - * Notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); - + regs->regs[1] = ptr_to_compat_reg(&frame->info); + regs->regs[2] = ptr_to_compat_reg(&frame->uc); + regs->flags |= PT_FLAGS_CALLER_SAVES; return 0; give_sigsegv: diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c index 493a0e66d91..b608e00e7f6 100644 --- a/arch/tile/kernel/early_printk.c +++ b/arch/tile/kernel/early_printk.c @@ -16,41 +16,31 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/string.h> +#include <linux/irqflags.h> +#include <linux/printk.h> #include <asm/setup.h> #include <hv/hypervisor.h> static void early_hv_write(struct console *con, const char *s, unsigned n) { - hv_console_write((HV_VirtAddr) s, n); + tile_console_write(s, n); + + /* + * Convert NL to NLCR (close enough to CRNL) during early boot. + * We assume newlines are at the ends of strings, which turns out + * to be good enough for early boot console output. + */ + if (n && s[n-1] == '\n') + tile_console_write("\r", 1); } static struct console early_hv_console = { .name = "earlyhv", .write = early_hv_write, - .flags = CON_PRINTBUFFER, + .flags = CON_PRINTBUFFER | CON_BOOT, .index = -1, }; -/* Direct interface for emergencies */ -static struct console *early_console = &early_hv_console; -static int early_console_initialized; -static int early_console_complete; - -static void early_vprintk(const char *fmt, va_list ap) -{ - char buf[512]; - int n = vscnprintf(buf, sizeof(buf), fmt, ap); - early_console->write(early_console, buf, n); -} - -void early_printk(const char *fmt, ...) -{ - va_list ap; - va_start(ap, fmt); - early_vprintk(fmt, ap); - va_end(ap); -} - void early_panic(const char *fmt, ...) { va_list ap; @@ -58,52 +48,21 @@ void early_panic(const char *fmt, ...) va_start(ap, fmt); early_printk("Kernel panic - not syncing: "); early_vprintk(fmt, ap); - early_console->write(early_console, "\n", 1); + early_printk("\n"); va_end(ap); dump_stack(); hv_halt(); } -static int __initdata keep_early; - static int __init setup_early_printk(char *str) { - if (early_console_initialized) + if (early_console) return 1; - if (str != NULL && strncmp(str, "keep", 4) == 0) - keep_early = 1; - early_console = &early_hv_console; - early_console_initialized = 1; register_console(early_console); return 0; } -void __init disable_early_printk(void) -{ - early_console_complete = 1; - if (!early_console_initialized || !early_console) - return; - if (!keep_early) { - early_printk("disabling early console\n"); - unregister_console(early_console); - early_console_initialized = 0; - } else { - early_printk("keeping early console\n"); - } -} - -void warn_early_printk(void) -{ - if (early_console_complete || early_console_initialized) - return; - early_printk("\ -Machine shutting down before console output is fully initialized.\n\ -You may wish to reboot and add the option 'earlyprintk' to your\n\ -boot command line to see any diagnostic early console output.\n\ -"); -} - early_param("earlyprintk", setup_early_printk); diff --git a/arch/tile/kernel/entry.S b/arch/tile/kernel/entry.S index 431e9ae6048..3d9175992a2 100644 --- a/arch/tile/kernel/entry.S +++ b/arch/tile/kernel/entry.S @@ -27,33 +27,6 @@ STD_ENTRY(current_text_addr) { move r0, lr; jrp lr } STD_ENDPROC(current_text_addr) -/* - * Implement execve(). The i386 code has a note that forking from kernel - * space results in no copy on write until the execve, so we should be - * careful not to write to the stack here. - */ -STD_ENTRY(kernel_execve) - moveli TREG_SYSCALL_NR_NAME, __NR_execve - swint1 - jrp lr - STD_ENDPROC(kernel_execve) - -/* - * We don't run this function directly, but instead copy it to a page - * we map into every user process. See vdso_setup(). - * - * Note that libc has a copy of this function that it uses to compare - * against the PC when a stack backtrace ends, so if this code is - * changed, the libc implementation(s) should also be updated. - */ - .pushsection .data -ENTRY(__rt_sigreturn) - moveli TREG_SYSCALL_NR_NAME,__NR_rt_sigreturn - swint1 - ENDPROC(__rt_sigreturn) - ENTRY(__rt_sigreturn_end) - .popsection - STD_ENTRY(dump_stack) { move r2, lr; lnk r1 } { move r4, r52; addli r1, r1, dump_stack - . } @@ -68,23 +41,10 @@ STD_ENTRY(KBacktraceIterator_init_current) jrp lr /* keep backtracer happy */ STD_ENDPROC(KBacktraceIterator_init_current) -/* - * Reset our stack to r1/r2 (sp and ksp0+cpu respectively), then - * free the old stack (passed in r0) and re-invoke cpu_idle(). - * We update sp and ksp0 simultaneously to avoid backtracer warnings. - */ -STD_ENTRY(cpu_idle_on_new_stack) - { - move sp, r1 - mtspr SPR_SYSTEM_SAVE_K_0, r2 - } - jal free_thread_info - j cpu_idle - STD_ENDPROC(cpu_idle_on_new_stack) - /* Loop forever on a nap during SMP boot. */ STD_ENTRY(smp_nap) nap + nop /* avoid provoking the icache prefetch with a jump */ j smp_nap /* we are not architecturally guaranteed not to exit nap */ jrp lr /* clue in the backtracer */ STD_ENDPROC(smp_nap) @@ -99,11 +59,13 @@ STD_ENTRY(smp_nap) */ STD_ENTRY(_cpu_idle) movei r1, 1 + IRQ_ENABLE_LOAD(r2, r3) mtspr INTERRUPT_CRITICAL_SECTION, r1 - IRQ_ENABLE(r2, r3) /* unmask, but still with ICS set */ + IRQ_ENABLE_APPLY(r2, r3) /* unmask, but still with ICS set */ mtspr INTERRUPT_CRITICAL_SECTION, zero .global _cpu_idle_nap _cpu_idle_nap: nap + nop /* avoid provoking the icache prefetch with a jump */ jrp lr STD_ENDPROC(_cpu_idle) diff --git a/arch/tile/kernel/ftrace.c b/arch/tile/kernel/ftrace.c new file mode 100644 index 00000000000..8d52d83cc51 --- /dev/null +++ b/arch/tile/kernel/ftrace.c @@ -0,0 +1,244 @@ +/* + * Copyright 2012 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + * + * TILE-Gx specific ftrace support + */ + +#include <linux/ftrace.h> +#include <linux/uaccess.h> + +#include <asm/cacheflush.h> +#include <asm/ftrace.h> +#include <asm/sections.h> + +#include <arch/opcode.h> + +#ifdef CONFIG_DYNAMIC_FTRACE + +static inline tilegx_bundle_bits NOP(void) +{ + return create_UnaryOpcodeExtension_X0(FNOP_UNARY_OPCODE_X0) | + create_RRROpcodeExtension_X0(UNARY_RRR_0_OPCODE_X0) | + create_Opcode_X0(RRR_0_OPCODE_X0) | + create_UnaryOpcodeExtension_X1(NOP_UNARY_OPCODE_X1) | + create_RRROpcodeExtension_X1(UNARY_RRR_0_OPCODE_X1) | + create_Opcode_X1(RRR_0_OPCODE_X1); +} + +static int machine_stopped __read_mostly; + +int ftrace_arch_code_modify_prepare(void) +{ + machine_stopped = 1; + return 0; +} + +int ftrace_arch_code_modify_post_process(void) +{ + flush_icache_range(0, CHIP_L1I_CACHE_SIZE()); + machine_stopped = 0; + return 0; +} + +/* + * Put { move r10, lr; jal ftrace_caller } in a bundle, this lets dynamic + * tracer just add one cycle overhead to every kernel function when disabled. + */ +static unsigned long ftrace_gen_branch(unsigned long pc, unsigned long addr, + bool link) +{ + tilegx_bundle_bits opcode_x0, opcode_x1; + long pcrel_by_instr = (addr - pc) >> TILEGX_LOG2_BUNDLE_SIZE_IN_BYTES; + + if (link) { + /* opcode: jal addr */ + opcode_x1 = + create_Opcode_X1(JUMP_OPCODE_X1) | + create_JumpOpcodeExtension_X1(JAL_JUMP_OPCODE_X1) | + create_JumpOff_X1(pcrel_by_instr); + } else { + /* opcode: j addr */ + opcode_x1 = + create_Opcode_X1(JUMP_OPCODE_X1) | + create_JumpOpcodeExtension_X1(J_JUMP_OPCODE_X1) | + create_JumpOff_X1(pcrel_by_instr); + } + + if (addr == FTRACE_ADDR) { + /* opcode: or r10, lr, zero */ + opcode_x0 = + create_Dest_X0(10) | + create_SrcA_X0(TREG_LR) | + create_SrcB_X0(TREG_ZERO) | + create_RRROpcodeExtension_X0(OR_RRR_0_OPCODE_X0) | + create_Opcode_X0(RRR_0_OPCODE_X0); + } else { + /* opcode: fnop */ + opcode_x0 = + create_UnaryOpcodeExtension_X0(FNOP_UNARY_OPCODE_X0) | + create_RRROpcodeExtension_X0(UNARY_RRR_0_OPCODE_X0) | + create_Opcode_X0(RRR_0_OPCODE_X0); + } + + return opcode_x1 | opcode_x0; +} + +static unsigned long ftrace_nop_replace(struct dyn_ftrace *rec) +{ + return NOP(); +} + +static unsigned long ftrace_call_replace(unsigned long pc, unsigned long addr) +{ + return ftrace_gen_branch(pc, addr, true); +} + +static int ftrace_modify_code(unsigned long pc, unsigned long old, + unsigned long new) +{ + unsigned long pc_wr; + + /* Check if the address is in kernel text space and module space. */ + if (!kernel_text_address(pc)) + return -EINVAL; + + /* Operate on writable kernel text mapping. */ + pc_wr = pc - MEM_SV_START + PAGE_OFFSET; + + if (probe_kernel_write((void *)pc_wr, &new, MCOUNT_INSN_SIZE)) + return -EPERM; + + smp_wmb(); + + if (!machine_stopped && num_online_cpus() > 1) + flush_icache_range(pc, pc + MCOUNT_INSN_SIZE); + + return 0; +} + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long pc, old; + unsigned long new; + int ret; + + pc = (unsigned long)&ftrace_call; + memcpy(&old, &ftrace_call, MCOUNT_INSN_SIZE); + new = ftrace_call_replace(pc, (unsigned long)func); + + ret = ftrace_modify_code(pc, old, new); + + return ret; +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long new, old; + unsigned long ip = rec->ip; + + old = ftrace_nop_replace(rec); + new = ftrace_call_replace(ip, addr); + + return ftrace_modify_code(rec->ip, old, new); +} + +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned long old; + unsigned long new; + int ret; + + old = ftrace_call_replace(ip, addr); + new = ftrace_nop_replace(rec); + ret = ftrace_modify_code(ip, old, new); + + return ret; +} + +int __init ftrace_dyn_arch_init(void) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, + unsigned long frame_pointer) +{ + unsigned long return_hooker = (unsigned long) &return_to_handler; + struct ftrace_graph_ent trace; + unsigned long old; + int err; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; + + old = *parent; + *parent = return_hooker; + + err = ftrace_push_return_trace(old, self_addr, &trace.depth, + frame_pointer); + if (err == -EBUSY) { + *parent = old; + return; + } + + trace.func = self_addr; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) { + current->curr_ret_stack--; + *parent = old; + } +} + +#ifdef CONFIG_DYNAMIC_FTRACE +extern unsigned long ftrace_graph_call; + +static int __ftrace_modify_caller(unsigned long *callsite, + void (*func) (void), bool enable) +{ + unsigned long caller_fn = (unsigned long) func; + unsigned long pc = (unsigned long) callsite; + unsigned long branch = ftrace_gen_branch(pc, caller_fn, false); + unsigned long nop = NOP(); + unsigned long old = enable ? nop : branch; + unsigned long new = enable ? branch : nop; + + return ftrace_modify_code(pc, old, new); +} + +static int ftrace_modify_graph_caller(bool enable) +{ + int ret; + + ret = __ftrace_modify_caller(&ftrace_graph_call, + ftrace_graph_caller, + enable); + + return ret; +} + +int ftrace_enable_ftrace_graph_caller(void) +{ + return ftrace_modify_graph_caller(true); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + return ftrace_modify_graph_caller(false); +} +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/tile/kernel/futex_64.S b/arch/tile/kernel/futex_64.S deleted file mode 100644 index f465d1eda20..00000000000 --- a/arch/tile/kernel/futex_64.S +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2011 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - * - * Atomically access user memory, but use MMU to avoid propagating - * kernel exceptions. - */ - -#include <linux/linkage.h> -#include <asm/errno.h> -#include <asm/futex.h> -#include <asm/page.h> -#include <asm/processor.h> - -/* - * Provide a set of atomic memory operations supporting <asm/futex.h>. - * - * r0: user address to manipulate - * r1: new value to write, or for cmpxchg, old value to compare against - * r2: (cmpxchg only) new value to write - * - * Return __get_user struct, r0 with value, r1 with error. - */ -#define FUTEX_OP(name, ...) \ -STD_ENTRY(futex_##name) \ - __VA_ARGS__; \ - { \ - move r1, zero; \ - jrp lr \ - }; \ - STD_ENDPROC(futex_##name); \ - .pushsection __ex_table,"a"; \ - .quad 1b, get_user_fault; \ - .popsection - - .pushsection .fixup,"ax" -get_user_fault: - { movei r1, -EFAULT; jrp lr } - ENDPROC(get_user_fault) - .popsection - -FUTEX_OP(cmpxchg, mtspr CMPEXCH_VALUE, r1; 1: cmpexch4 r0, r0, r2) -FUTEX_OP(set, 1: exch4 r0, r0, r1) -FUTEX_OP(add, 1: fetchadd4 r0, r0, r1) -FUTEX_OP(or, 1: fetchor4 r0, r0, r1) -FUTEX_OP(andn, nor r1, r1, zero; 1: fetchand4 r0, r0, r1) diff --git a/arch/tile/kernel/hardwall.c b/arch/tile/kernel/hardwall.c index 8c41891aab3..531f4c36535 100644 --- a/arch/tile/kernel/hardwall.c +++ b/arch/tile/kernel/hardwall.c @@ -33,59 +33,157 @@ /* - * This data structure tracks the rectangle data, etc., associated - * one-to-one with a "struct file *" from opening HARDWALL_FILE. + * Implement a per-cpu "hardwall" resource class such as UDN or IPI. + * We use "hardwall" nomenclature throughout for historical reasons. + * The lock here controls access to the list data structure as well as + * to the items on the list. + */ +struct hardwall_type { + int index; + int is_xdn; + int is_idn; + int disabled; + const char *name; + struct list_head list; + spinlock_t lock; + struct proc_dir_entry *proc_dir; +}; + +enum hardwall_index { + HARDWALL_UDN = 0, +#ifndef __tilepro__ + HARDWALL_IDN = 1, + HARDWALL_IPI = 2, +#endif + _HARDWALL_TYPES +}; + +static struct hardwall_type hardwall_types[] = { + { /* user-space access to UDN */ + 0, + 1, + 0, + 0, + "udn", + LIST_HEAD_INIT(hardwall_types[HARDWALL_UDN].list), + __SPIN_LOCK_UNLOCKED(hardwall_types[HARDWALL_UDN].lock), + NULL + }, +#ifndef __tilepro__ + { /* user-space access to IDN */ + 1, + 1, + 1, + 1, /* disabled pending hypervisor support */ + "idn", + LIST_HEAD_INIT(hardwall_types[HARDWALL_IDN].list), + __SPIN_LOCK_UNLOCKED(hardwall_types[HARDWALL_IDN].lock), + NULL + }, + { /* access to user-space IPI */ + 2, + 0, + 0, + 0, + "ipi", + LIST_HEAD_INIT(hardwall_types[HARDWALL_IPI].list), + __SPIN_LOCK_UNLOCKED(hardwall_types[HARDWALL_IPI].lock), + NULL + }, +#endif +}; + +/* + * This data structure tracks the cpu data, etc., associated + * one-to-one with a "struct file *" from opening a hardwall device file. * Note that the file's private data points back to this structure. */ struct hardwall_info { - struct list_head list; /* "rectangles" list */ + struct list_head list; /* for hardwall_types.list */ struct list_head task_head; /* head of tasks in this hardwall */ - struct cpumask cpumask; /* cpus in the rectangle */ + struct hardwall_type *type; /* type of this resource */ + struct cpumask cpumask; /* cpus reserved */ + int id; /* integer id for this hardwall */ + int teardown_in_progress; /* are we tearing this one down? */ + + /* Remaining fields only valid for user-network resources. */ int ulhc_x; /* upper left hand corner x coord */ int ulhc_y; /* upper left hand corner y coord */ int width; /* rectangle width */ int height; /* rectangle height */ - int id; /* integer id for this hardwall */ - int teardown_in_progress; /* are we tearing this one down? */ +#if CHIP_HAS_REV1_XDN() + atomic_t xdn_pending_count; /* cores in phase 1 of drain */ +#endif }; -/* Currently allocated hardwall rectangles */ -static LIST_HEAD(rectangles); /* /proc/tile/hardwall */ static struct proc_dir_entry *hardwall_proc_dir; /* Functions to manage files in /proc/tile/hardwall. */ -static void hardwall_add_proc(struct hardwall_info *rect); -static void hardwall_remove_proc(struct hardwall_info *rect); - -/* - * Guard changes to the hardwall data structures. - * This could be finer grained (e.g. one lock for the list of hardwall - * rectangles, then separate embedded locks for each one's list of tasks), - * but there are subtle correctness issues when trying to start with - * a task's "hardwall" pointer and lock the correct rectangle's embedded - * lock in the presence of a simultaneous deactivation, so it seems - * easier to have a single lock, given that none of these data - * structures are touched very frequently during normal operation. - */ -static DEFINE_SPINLOCK(hardwall_lock); +static void hardwall_add_proc(struct hardwall_info *); +static void hardwall_remove_proc(struct hardwall_info *); /* Allow disabling UDN access. */ -static int udn_disabled; static int __init noudn(char *str) { pr_info("User-space UDN access is disabled\n"); - udn_disabled = 1; + hardwall_types[HARDWALL_UDN].disabled = 1; return 0; } early_param("noudn", noudn); +#ifndef __tilepro__ +/* Allow disabling IDN access. */ +static int __init noidn(char *str) +{ + pr_info("User-space IDN access is disabled\n"); + hardwall_types[HARDWALL_IDN].disabled = 1; + return 0; +} +early_param("noidn", noidn); + +/* Allow disabling IPI access. */ +static int __init noipi(char *str) +{ + pr_info("User-space IPI access is disabled\n"); + hardwall_types[HARDWALL_IPI].disabled = 1; + return 0; +} +early_param("noipi", noipi); +#endif + /* - * Low-level primitives + * Low-level primitives for UDN/IDN */ +#ifdef __tilepro__ +#define mtspr_XDN(hwt, name, val) \ + do { (void)(hwt); __insn_mtspr(SPR_UDN_##name, (val)); } while (0) +#define mtspr_MPL_XDN(hwt, name, val) \ + do { (void)(hwt); __insn_mtspr(SPR_MPL_UDN_##name, (val)); } while (0) +#define mfspr_XDN(hwt, name) \ + ((void)(hwt), __insn_mfspr(SPR_UDN_##name)) +#else +#define mtspr_XDN(hwt, name, val) \ + do { \ + if ((hwt)->is_idn) \ + __insn_mtspr(SPR_IDN_##name, (val)); \ + else \ + __insn_mtspr(SPR_UDN_##name, (val)); \ + } while (0) +#define mtspr_MPL_XDN(hwt, name, val) \ + do { \ + if ((hwt)->is_idn) \ + __insn_mtspr(SPR_MPL_IDN_##name, (val)); \ + else \ + __insn_mtspr(SPR_MPL_UDN_##name, (val)); \ + } while (0) +#define mfspr_XDN(hwt, name) \ + ((hwt)->is_idn ? __insn_mfspr(SPR_IDN_##name) : __insn_mfspr(SPR_UDN_##name)) +#endif + /* Set a CPU bit if the CPU is online. */ #define cpu_online_set(cpu, dst) do { \ if (cpu_online(cpu)) \ @@ -101,7 +199,7 @@ static int contains(struct hardwall_info *r, int x, int y) } /* Compute the rectangle parameters and validate the cpumask. */ -static int setup_rectangle(struct hardwall_info *r, struct cpumask *mask) +static int check_rectangle(struct hardwall_info *r, struct cpumask *mask) { int x, y, cpu, ulhc, lrhc; @@ -114,8 +212,6 @@ static int setup_rectangle(struct hardwall_info *r, struct cpumask *mask) r->ulhc_y = cpu_y(ulhc); r->width = cpu_x(lrhc) - r->ulhc_x + 1; r->height = cpu_y(lrhc) - r->ulhc_y + 1; - cpumask_copy(&r->cpumask, mask); - r->id = ulhc; /* The ulhc cpu id can be the hardwall id. */ /* Width and height must be positive */ if (r->width <= 0 || r->height <= 0) @@ -128,7 +224,7 @@ static int setup_rectangle(struct hardwall_info *r, struct cpumask *mask) return -EINVAL; /* - * Note that offline cpus can't be drained when this UDN + * Note that offline cpus can't be drained when this user network * rectangle eventually closes. We used to detect this * situation and print a warning, but it annoyed users and * they ignored it anyway, so now we just return without a @@ -137,16 +233,6 @@ static int setup_rectangle(struct hardwall_info *r, struct cpumask *mask) return 0; } -/* Do the two given rectangles overlap on any cpu? */ -static int overlaps(struct hardwall_info *a, struct hardwall_info *b) -{ - return a->ulhc_x + a->width > b->ulhc_x && /* A not to the left */ - b->ulhc_x + b->width > a->ulhc_x && /* B not to the left */ - a->ulhc_y + a->height > b->ulhc_y && /* A not above */ - b->ulhc_y + b->height > a->ulhc_y; /* B not above */ -} - - /* * Hardware management of hardwall setup, teardown, trapping, * and enabling/disabling PL0 access to the networks. @@ -157,26 +243,38 @@ enum direction_protect { N_PROTECT = (1 << 0), E_PROTECT = (1 << 1), S_PROTECT = (1 << 2), - W_PROTECT = (1 << 3) + W_PROTECT = (1 << 3), + C_PROTECT = (1 << 4), }; -static void enable_firewall_interrupts(void) +static inline int xdn_which_interrupt(struct hardwall_type *hwt) { - arch_local_irq_unmask_now(INT_UDN_FIREWALL); +#ifndef __tilepro__ + if (hwt->is_idn) + return INT_IDN_FIREWALL; +#endif + return INT_UDN_FIREWALL; } -static void disable_firewall_interrupts(void) +static void enable_firewall_interrupts(struct hardwall_type *hwt) { - arch_local_irq_mask_now(INT_UDN_FIREWALL); + arch_local_irq_unmask_now(xdn_which_interrupt(hwt)); +} + +static void disable_firewall_interrupts(struct hardwall_type *hwt) +{ + arch_local_irq_mask_now(xdn_which_interrupt(hwt)); } /* Set up hardwall on this cpu based on the passed hardwall_info. */ -static void hardwall_setup_ipi_func(void *info) +static void hardwall_setup_func(void *info) { struct hardwall_info *r = info; - int cpu = smp_processor_id(); - int x = cpu % smp_width; - int y = cpu / smp_width; + struct hardwall_type *hwt = r->type; + + int cpu = smp_processor_id(); /* on_each_cpu disables preemption */ + int x = cpu_x(cpu); + int y = cpu_y(cpu); int bits = 0; if (x == r->ulhc_x) bits |= W_PROTECT; @@ -187,13 +285,12 @@ static void hardwall_setup_ipi_func(void *info) if (y == r->ulhc_y + r->height - 1) bits |= S_PROTECT; BUG_ON(bits == 0); - __insn_mtspr(SPR_UDN_DIRECTION_PROTECT, bits); - enable_firewall_interrupts(); - + mtspr_XDN(hwt, DIRECTION_PROTECT, bits); + enable_firewall_interrupts(hwt); } /* Set up all cpus on edge of rectangle to enable/disable hardwall SPRs. */ -static void hardwall_setup(struct hardwall_info *r) +static void hardwall_protect_rectangle(struct hardwall_info *r) { int x, y, cpu, delta; struct cpumask rect_cpus; @@ -217,37 +314,50 @@ static void hardwall_setup(struct hardwall_info *r) } /* Then tell all the cpus to set up their protection SPR */ - on_each_cpu_mask(&rect_cpus, hardwall_setup_ipi_func, r, 1); + on_each_cpu_mask(&rect_cpus, hardwall_setup_func, r, 1); } +/* Entered from INT_xDN_FIREWALL interrupt vector with irqs disabled. */ void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num) { struct hardwall_info *rect; + struct hardwall_type *hwt; struct task_struct *p; struct siginfo info; - int x, y; int cpu = smp_processor_id(); int found_processes; - unsigned long flags; - struct pt_regs *old_regs = set_irq_regs(regs); + irq_enter(); + /* Figure out which network trapped. */ + switch (fault_num) { +#ifndef __tilepro__ + case INT_IDN_FIREWALL: + hwt = &hardwall_types[HARDWALL_IDN]; + break; +#endif + case INT_UDN_FIREWALL: + hwt = &hardwall_types[HARDWALL_UDN]; + break; + default: + BUG(); + } + BUG_ON(hwt->disabled); + /* This tile trapped a network access; find the rectangle. */ - x = cpu % smp_width; - y = cpu / smp_width; - spin_lock_irqsave(&hardwall_lock, flags); - list_for_each_entry(rect, &rectangles, list) { - if (contains(rect, x, y)) + spin_lock(&hwt->lock); + list_for_each_entry(rect, &hwt->list, list) { + if (cpumask_test_cpu(cpu, &rect->cpumask)) break; } /* * It shouldn't be possible not to find this cpu on the * rectangle list, since only cpus in rectangles get hardwalled. - * The hardwall is only removed after the UDN is drained. + * The hardwall is only removed after the user network is drained. */ - BUG_ON(&rect->list == &rectangles); + BUG_ON(&rect->list == &hwt->list); /* * If we already started teardown on this hardwall, don't worry; @@ -255,30 +365,32 @@ void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num) * to quiesce. */ if (rect->teardown_in_progress) { - pr_notice("cpu %d: detected hardwall violation %#lx" + pr_notice("cpu %d: detected %s hardwall violation %#lx" " while teardown already in progress\n", - cpu, (long) __insn_mfspr(SPR_UDN_DIRECTION_PROTECT)); + cpu, hwt->name, + (long)mfspr_XDN(hwt, DIRECTION_PROTECT)); goto done; } /* * Kill off any process that is activated in this rectangle. * We bypass security to deliver the signal, since it must be - * one of the activated processes that generated the UDN + * one of the activated processes that generated the user network * message that caused this trap, and all the activated * processes shared a single open file so are pretty tightly * bound together from a security point of view to begin with. */ rect->teardown_in_progress = 1; wmb(); /* Ensure visibility of rectangle before notifying processes. */ - pr_notice("cpu %d: detected hardwall violation %#lx...\n", - cpu, (long) __insn_mfspr(SPR_UDN_DIRECTION_PROTECT)); + pr_notice("cpu %d: detected %s hardwall violation %#lx...\n", + cpu, hwt->name, (long)mfspr_XDN(hwt, DIRECTION_PROTECT)); info.si_signo = SIGILL; info.si_errno = 0; info.si_code = ILL_HARDWALL; found_processes = 0; - list_for_each_entry(p, &rect->task_head, thread.hardwall_list) { - BUG_ON(p->thread.hardwall != rect); + list_for_each_entry(p, &rect->task_head, + thread.hardwall[hwt->index].list) { + BUG_ON(p->thread.hardwall[hwt->index].info != rect); if (!(p->flags & PF_EXITING)) { found_processes = 1; pr_notice("hardwall: killing %d\n", p->pid); @@ -289,7 +401,7 @@ void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num) pr_notice("hardwall: no associated processes!\n"); done: - spin_unlock_irqrestore(&hardwall_lock, flags); + spin_unlock(&hwt->lock); /* * We have to disable firewall interrupts now, or else when we @@ -298,48 +410,87 @@ void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num) * haven't yet drained the network, and that would allow packets * to cross out of the hardwall region. */ - disable_firewall_interrupts(); + disable_firewall_interrupts(hwt); irq_exit(); set_irq_regs(old_regs); } -/* Allow access from user space to the UDN. */ -void grant_network_mpls(void) +/* Allow access from user space to the user network. */ +void grant_hardwall_mpls(struct hardwall_type *hwt) { - __insn_mtspr(SPR_MPL_UDN_ACCESS_SET_0, 1); - __insn_mtspr(SPR_MPL_UDN_AVAIL_SET_0, 1); - __insn_mtspr(SPR_MPL_UDN_COMPLETE_SET_0, 1); - __insn_mtspr(SPR_MPL_UDN_TIMER_SET_0, 1); +#ifndef __tilepro__ + if (!hwt->is_xdn) { + __insn_mtspr(SPR_MPL_IPI_0_SET_0, 1); + return; + } +#endif + mtspr_MPL_XDN(hwt, ACCESS_SET_0, 1); + mtspr_MPL_XDN(hwt, AVAIL_SET_0, 1); + mtspr_MPL_XDN(hwt, COMPLETE_SET_0, 1); + mtspr_MPL_XDN(hwt, TIMER_SET_0, 1); #if !CHIP_HAS_REV1_XDN() - __insn_mtspr(SPR_MPL_UDN_REFILL_SET_0, 1); - __insn_mtspr(SPR_MPL_UDN_CA_SET_0, 1); + mtspr_MPL_XDN(hwt, REFILL_SET_0, 1); + mtspr_MPL_XDN(hwt, CA_SET_0, 1); #endif } -/* Deny access from user space to the UDN. */ -void restrict_network_mpls(void) +/* Deny access from user space to the user network. */ +void restrict_hardwall_mpls(struct hardwall_type *hwt) { - __insn_mtspr(SPR_MPL_UDN_ACCESS_SET_1, 1); - __insn_mtspr(SPR_MPL_UDN_AVAIL_SET_1, 1); - __insn_mtspr(SPR_MPL_UDN_COMPLETE_SET_1, 1); - __insn_mtspr(SPR_MPL_UDN_TIMER_SET_1, 1); +#ifndef __tilepro__ + if (!hwt->is_xdn) { + __insn_mtspr(SPR_MPL_IPI_0_SET_1, 1); + return; + } +#endif + mtspr_MPL_XDN(hwt, ACCESS_SET_1, 1); + mtspr_MPL_XDN(hwt, AVAIL_SET_1, 1); + mtspr_MPL_XDN(hwt, COMPLETE_SET_1, 1); + mtspr_MPL_XDN(hwt, TIMER_SET_1, 1); #if !CHIP_HAS_REV1_XDN() - __insn_mtspr(SPR_MPL_UDN_REFILL_SET_1, 1); - __insn_mtspr(SPR_MPL_UDN_CA_SET_1, 1); + mtspr_MPL_XDN(hwt, REFILL_SET_1, 1); + mtspr_MPL_XDN(hwt, CA_SET_1, 1); #endif } +/* Restrict or deny as necessary for the task we're switching to. */ +void hardwall_switch_tasks(struct task_struct *prev, + struct task_struct *next) +{ + int i; + for (i = 0; i < HARDWALL_TYPES; ++i) { + if (prev->thread.hardwall[i].info != NULL) { + if (next->thread.hardwall[i].info == NULL) + restrict_hardwall_mpls(&hardwall_types[i]); + } else if (next->thread.hardwall[i].info != NULL) { + grant_hardwall_mpls(&hardwall_types[i]); + } + } +} + +/* Does this task have the right to IPI the given cpu? */ +int hardwall_ipi_valid(int cpu) +{ +#ifdef __tilegx__ + struct hardwall_info *info = + current->thread.hardwall[HARDWALL_IPI].info; + return info && cpumask_test_cpu(cpu, &info->cpumask); +#else + return 0; +#endif +} /* - * Code to create, activate, deactivate, and destroy hardwall rectangles. + * Code to create, activate, deactivate, and destroy hardwall resources. */ -/* Create a hardwall for the given rectangle */ -static struct hardwall_info *hardwall_create( - size_t size, const unsigned char __user *bits) +/* Create a hardwall for the given resource */ +static struct hardwall_info *hardwall_create(struct hardwall_type *hwt, + size_t size, + const unsigned char __user *bits) { - struct hardwall_info *iter, *rect; + struct hardwall_info *iter, *info; struct cpumask mask; unsigned long flags; int rc; @@ -370,55 +521,70 @@ static struct hardwall_info *hardwall_create( } } - /* Allocate a new rectangle optimistically. */ - rect = kmalloc(sizeof(struct hardwall_info), + /* Allocate a new hardwall_info optimistically. */ + info = kmalloc(sizeof(struct hardwall_info), GFP_KERNEL | __GFP_ZERO); - if (rect == NULL) + if (info == NULL) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&rect->task_head); + INIT_LIST_HEAD(&info->task_head); + info->type = hwt; /* Compute the rectangle size and validate that it's plausible. */ - rc = setup_rectangle(rect, &mask); - if (rc != 0) { - kfree(rect); - return ERR_PTR(rc); + cpumask_copy(&info->cpumask, &mask); + info->id = find_first_bit(cpumask_bits(&mask), nr_cpumask_bits); + if (hwt->is_xdn) { + rc = check_rectangle(info, &mask); + if (rc != 0) { + kfree(info); + return ERR_PTR(rc); + } } + /* + * Eliminate cpus that are not part of this Linux client. + * Note that this allows for configurations that we might not want to + * support, such as one client on every even cpu, another client on + * every odd cpu. + */ + cpumask_and(&info->cpumask, &info->cpumask, cpu_online_mask); + /* Confirm it doesn't overlap and add it to the list. */ - spin_lock_irqsave(&hardwall_lock, flags); - list_for_each_entry(iter, &rectangles, list) { - if (overlaps(iter, rect)) { - spin_unlock_irqrestore(&hardwall_lock, flags); - kfree(rect); + spin_lock_irqsave(&hwt->lock, flags); + list_for_each_entry(iter, &hwt->list, list) { + if (cpumask_intersects(&iter->cpumask, &info->cpumask)) { + spin_unlock_irqrestore(&hwt->lock, flags); + kfree(info); return ERR_PTR(-EBUSY); } } - list_add_tail(&rect->list, &rectangles); - spin_unlock_irqrestore(&hardwall_lock, flags); + list_add_tail(&info->list, &hwt->list); + spin_unlock_irqrestore(&hwt->lock, flags); /* Set up appropriate hardwalling on all affected cpus. */ - hardwall_setup(rect); + if (hwt->is_xdn) + hardwall_protect_rectangle(info); /* Create a /proc/tile/hardwall entry. */ - hardwall_add_proc(rect); + hardwall_add_proc(info); - return rect; + return info; } /* Activate a given hardwall on this cpu for this process. */ -static int hardwall_activate(struct hardwall_info *rect) +static int hardwall_activate(struct hardwall_info *info) { - int cpu, x, y; + int cpu; unsigned long flags; struct task_struct *p = current; struct thread_struct *ts = &p->thread; + struct hardwall_type *hwt; - /* Require a rectangle. */ - if (rect == NULL) + /* Require a hardwall. */ + if (info == NULL) return -ENODATA; - /* Not allowed to activate a rectangle that is being torn down. */ - if (rect->teardown_in_progress) + /* Not allowed to activate a hardwall that is being torn down. */ + if (info->teardown_in_progress) return -EINVAL; /* @@ -428,78 +594,87 @@ static int hardwall_activate(struct hardwall_info *rect) if (cpumask_weight(&p->cpus_allowed) != 1) return -EPERM; - /* Make sure we are bound to a cpu in this rectangle. */ + /* Make sure we are bound to a cpu assigned to this resource. */ cpu = smp_processor_id(); BUG_ON(cpumask_first(&p->cpus_allowed) != cpu); - x = cpu_x(cpu); - y = cpu_y(cpu); - if (!contains(rect, x, y)) + if (!cpumask_test_cpu(cpu, &info->cpumask)) return -EINVAL; /* If we are already bound to this hardwall, it's a no-op. */ - if (ts->hardwall) { - BUG_ON(ts->hardwall != rect); + hwt = info->type; + if (ts->hardwall[hwt->index].info) { + BUG_ON(ts->hardwall[hwt->index].info != info); return 0; } - /* Success! This process gets to use the user networks on this cpu. */ - ts->hardwall = rect; - spin_lock_irqsave(&hardwall_lock, flags); - list_add(&ts->hardwall_list, &rect->task_head); - spin_unlock_irqrestore(&hardwall_lock, flags); - grant_network_mpls(); - printk(KERN_DEBUG "Pid %d (%s) activated for hardwall: cpu %d\n", - p->pid, p->comm, cpu); + /* Success! This process gets to use the resource on this cpu. */ + ts->hardwall[hwt->index].info = info; + spin_lock_irqsave(&hwt->lock, flags); + list_add(&ts->hardwall[hwt->index].list, &info->task_head); + spin_unlock_irqrestore(&hwt->lock, flags); + grant_hardwall_mpls(hwt); + printk(KERN_DEBUG "Pid %d (%s) activated for %s hardwall: cpu %d\n", + p->pid, p->comm, hwt->name, cpu); return 0; } /* - * Deactivate a task's hardwall. Must hold hardwall_lock. - * This method may be called from free_task(), so we don't want to + * Deactivate a task's hardwall. Must hold lock for hardwall_type. + * This method may be called from exit_thread(), so we don't want to * rely on too many fields of struct task_struct still being valid. * We assume the cpus_allowed, pid, and comm fields are still valid. */ -static void _hardwall_deactivate(struct task_struct *task) +static void _hardwall_deactivate(struct hardwall_type *hwt, + struct task_struct *task) { struct thread_struct *ts = &task->thread; if (cpumask_weight(&task->cpus_allowed) != 1) { - pr_err("pid %d (%s) releasing networks with" + pr_err("pid %d (%s) releasing %s hardwall with" " an affinity mask containing %d cpus!\n", - task->pid, task->comm, + task->pid, task->comm, hwt->name, cpumask_weight(&task->cpus_allowed)); BUG(); } - BUG_ON(ts->hardwall == NULL); - ts->hardwall = NULL; - list_del(&ts->hardwall_list); + BUG_ON(ts->hardwall[hwt->index].info == NULL); + ts->hardwall[hwt->index].info = NULL; + list_del(&ts->hardwall[hwt->index].list); if (task == current) - restrict_network_mpls(); + restrict_hardwall_mpls(hwt); } /* Deactivate a task's hardwall. */ -int hardwall_deactivate(struct task_struct *task) +static int hardwall_deactivate(struct hardwall_type *hwt, + struct task_struct *task) { unsigned long flags; int activated; - spin_lock_irqsave(&hardwall_lock, flags); - activated = (task->thread.hardwall != NULL); + spin_lock_irqsave(&hwt->lock, flags); + activated = (task->thread.hardwall[hwt->index].info != NULL); if (activated) - _hardwall_deactivate(task); - spin_unlock_irqrestore(&hardwall_lock, flags); + _hardwall_deactivate(hwt, task); + spin_unlock_irqrestore(&hwt->lock, flags); if (!activated) return -EINVAL; - printk(KERN_DEBUG "Pid %d (%s) deactivated for hardwall: cpu %d\n", - task->pid, task->comm, smp_processor_id()); + printk(KERN_DEBUG "Pid %d (%s) deactivated for %s hardwall: cpu %d\n", + task->pid, task->comm, hwt->name, raw_smp_processor_id()); return 0; } -/* Stop a UDN switch before draining the network. */ -static void stop_udn_switch(void *ignored) +void hardwall_deactivate_all(struct task_struct *task) +{ + int i; + for (i = 0; i < HARDWALL_TYPES; ++i) + if (task->thread.hardwall[i].info) + hardwall_deactivate(&hardwall_types[i], task); +} + +/* Stop the switch before draining the network. */ +static void stop_xdn_switch(void *arg) { #if !CHIP_HAS_REV1_XDN() /* Freeze the switch and the demux. */ @@ -507,13 +682,71 @@ static void stop_udn_switch(void *ignored) SPR_UDN_SP_FREEZE__SP_FRZ_MASK | SPR_UDN_SP_FREEZE__DEMUX_FRZ_MASK | SPR_UDN_SP_FREEZE__NON_DEST_EXT_MASK); +#else + /* + * Drop all packets bound for the core or off the edge. + * We rely on the normal hardwall protection setup code + * to have set the low four bits to trigger firewall interrupts, + * and shift those bits up to trigger "drop on send" semantics, + * plus adding "drop on send to core" for all switches. + * In practice it seems the switches latch the DIRECTION_PROTECT + * SPR so they won't start dropping if they're already + * delivering the last message to the core, but it doesn't + * hurt to enable it here. + */ + struct hardwall_type *hwt = arg; + unsigned long protect = mfspr_XDN(hwt, DIRECTION_PROTECT); + mtspr_XDN(hwt, DIRECTION_PROTECT, (protect | C_PROTECT) << 5); #endif } +static void empty_xdn_demuxes(struct hardwall_type *hwt) +{ +#ifndef __tilepro__ + if (hwt->is_idn) { + while (__insn_mfspr(SPR_IDN_DATA_AVAIL) & (1 << 0)) + (void) __tile_idn0_receive(); + while (__insn_mfspr(SPR_IDN_DATA_AVAIL) & (1 << 1)) + (void) __tile_idn1_receive(); + return; + } +#endif + while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 0)) + (void) __tile_udn0_receive(); + while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 1)) + (void) __tile_udn1_receive(); + while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 2)) + (void) __tile_udn2_receive(); + while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 3)) + (void) __tile_udn3_receive(); +} + /* Drain all the state from a stopped switch. */ -static void drain_udn_switch(void *ignored) +static void drain_xdn_switch(void *arg) { -#if !CHIP_HAS_REV1_XDN() + struct hardwall_info *info = arg; + struct hardwall_type *hwt = info->type; + +#if CHIP_HAS_REV1_XDN() + /* + * The switches have been configured to drop any messages + * destined for cores (or off the edge of the rectangle). + * But the current message may continue to be delivered, + * so we wait until all the cores have finished any pending + * messages before we stop draining. + */ + int pending = mfspr_XDN(hwt, PENDING); + while (pending--) { + empty_xdn_demuxes(hwt); + if (hwt->is_idn) + __tile_idn_send(0); + else + __tile_udn_send(0); + } + atomic_dec(&info->xdn_pending_count); + while (atomic_read(&info->xdn_pending_count)) + empty_xdn_demuxes(hwt); +#else int i; int from_tile_words, ca_count; @@ -533,15 +766,7 @@ static void drain_udn_switch(void *ignored) (void) __insn_mfspr(SPR_UDN_DEMUX_WRITE_FIFO); /* Empty out demuxes. */ - while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 0)) - (void) __tile_udn0_receive(); - while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 1)) - (void) __tile_udn1_receive(); - while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 2)) - (void) __tile_udn2_receive(); - while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 3)) - (void) __tile_udn3_receive(); - BUG_ON((__insn_mfspr(SPR_UDN_DATA_AVAIL) & 0xF) != 0); + empty_xdn_demuxes(hwt); /* Empty out catch all. */ ca_count = __insn_mfspr(SPR_UDN_DEMUX_CA_COUNT); @@ -563,21 +788,25 @@ static void drain_udn_switch(void *ignored) #endif } -/* Reset random UDN state registers at boot up and during hardwall teardown. */ -void reset_network_state(void) +/* Reset random XDN state registers at boot up and during hardwall teardown. */ +static void reset_xdn_network_state(struct hardwall_type *hwt) { -#if !CHIP_HAS_REV1_XDN() - /* Reset UDN coordinates to their standard value */ - unsigned int cpu = smp_processor_id(); - unsigned int x = cpu % smp_width; - unsigned int y = cpu / smp_width; -#endif - - if (udn_disabled) + if (hwt->disabled) return; + /* Clear out other random registers so we have a clean slate. */ + mtspr_XDN(hwt, DIRECTION_PROTECT, 0); + mtspr_XDN(hwt, AVAIL_EN, 0); + mtspr_XDN(hwt, DEADLOCK_TIMEOUT, 0); + #if !CHIP_HAS_REV1_XDN() - __insn_mtspr(SPR_UDN_TILE_COORD, (x << 18) | (y << 7)); + /* Reset UDN coordinates to their standard value */ + { + unsigned int cpu = smp_processor_id(); + unsigned int x = cpu_x(cpu); + unsigned int y = cpu_y(cpu); + __insn_mtspr(SPR_UDN_TILE_COORD, (x << 18) | (y << 7)); + } /* Set demux tags to predefined values and enable them. */ __insn_mtspr(SPR_UDN_TAG_VALID, 0xf); @@ -585,56 +814,50 @@ void reset_network_state(void) __insn_mtspr(SPR_UDN_TAG_1, (1 << 1)); __insn_mtspr(SPR_UDN_TAG_2, (1 << 2)); __insn_mtspr(SPR_UDN_TAG_3, (1 << 3)); -#endif - /* Clear out other random registers so we have a clean slate. */ - __insn_mtspr(SPR_UDN_AVAIL_EN, 0); - __insn_mtspr(SPR_UDN_DEADLOCK_TIMEOUT, 0); -#if !CHIP_HAS_REV1_XDN() + /* Set other rev0 random registers to a clean state. */ __insn_mtspr(SPR_UDN_REFILL_EN, 0); __insn_mtspr(SPR_UDN_DEMUX_QUEUE_SEL, 0); __insn_mtspr(SPR_UDN_SP_FIFO_SEL, 0); -#endif /* Start the switch and demux. */ -#if !CHIP_HAS_REV1_XDN() __insn_mtspr(SPR_UDN_SP_FREEZE, 0); #endif } -/* Restart a UDN switch after draining. */ -static void restart_udn_switch(void *ignored) +void reset_network_state(void) { - reset_network_state(); - - /* Disable firewall interrupts. */ - __insn_mtspr(SPR_UDN_DIRECTION_PROTECT, 0); - disable_firewall_interrupts(); + reset_xdn_network_state(&hardwall_types[HARDWALL_UDN]); +#ifndef __tilepro__ + reset_xdn_network_state(&hardwall_types[HARDWALL_IDN]); +#endif } -/* Build a struct cpumask containing all valid tiles in bounding rectangle. */ -static void fill_mask(struct hardwall_info *r, struct cpumask *result) +/* Restart an XDN switch after draining. */ +static void restart_xdn_switch(void *arg) { - int x, y, cpu; + struct hardwall_type *hwt = arg; - cpumask_clear(result); +#if CHIP_HAS_REV1_XDN() + /* One last drain step to avoid races with injection and draining. */ + empty_xdn_demuxes(hwt); +#endif - cpu = r->ulhc_y * smp_width + r->ulhc_x; - for (y = 0; y < r->height; ++y, cpu += smp_width - r->width) { - for (x = 0; x < r->width; ++x, ++cpu) - cpu_online_set(cpu, result); - } + reset_xdn_network_state(hwt); + + /* Disable firewall interrupts. */ + disable_firewall_interrupts(hwt); } /* Last reference to a hardwall is gone, so clear the network. */ -static void hardwall_destroy(struct hardwall_info *rect) +static void hardwall_destroy(struct hardwall_info *info) { struct task_struct *task; + struct hardwall_type *hwt; unsigned long flags; - struct cpumask mask; - /* Make sure this file actually represents a rectangle. */ - if (rect == NULL) + /* Make sure this file actually represents a hardwall. */ + if (info == NULL) return; /* @@ -644,39 +867,53 @@ static void hardwall_destroy(struct hardwall_info *rect) * deactivate any remaining tasks before freeing the * hardwall_info object itself. */ - spin_lock_irqsave(&hardwall_lock, flags); - list_for_each_entry(task, &rect->task_head, thread.hardwall_list) - _hardwall_deactivate(task); - spin_unlock_irqrestore(&hardwall_lock, flags); - - /* Drain the UDN. */ - printk(KERN_DEBUG "Clearing hardwall rectangle %dx%d %d,%d\n", - rect->width, rect->height, rect->ulhc_x, rect->ulhc_y); - fill_mask(rect, &mask); - on_each_cpu_mask(&mask, stop_udn_switch, NULL, 1); - on_each_cpu_mask(&mask, drain_udn_switch, NULL, 1); + hwt = info->type; + info->teardown_in_progress = 1; + spin_lock_irqsave(&hwt->lock, flags); + list_for_each_entry(task, &info->task_head, + thread.hardwall[hwt->index].list) + _hardwall_deactivate(hwt, task); + spin_unlock_irqrestore(&hwt->lock, flags); + + if (hwt->is_xdn) { + /* Configure the switches for draining the user network. */ + printk(KERN_DEBUG + "Clearing %s hardwall rectangle %dx%d %d,%d\n", + hwt->name, info->width, info->height, + info->ulhc_x, info->ulhc_y); + on_each_cpu_mask(&info->cpumask, stop_xdn_switch, hwt, 1); + + /* Drain the network. */ +#if CHIP_HAS_REV1_XDN() + atomic_set(&info->xdn_pending_count, + cpumask_weight(&info->cpumask)); + on_each_cpu_mask(&info->cpumask, drain_xdn_switch, info, 0); +#else + on_each_cpu_mask(&info->cpumask, drain_xdn_switch, info, 1); +#endif - /* Restart switch and disable firewall. */ - on_each_cpu_mask(&mask, restart_udn_switch, NULL, 1); + /* Restart switch and disable firewall. */ + on_each_cpu_mask(&info->cpumask, restart_xdn_switch, hwt, 1); + } /* Remove the /proc/tile/hardwall entry. */ - hardwall_remove_proc(rect); - - /* Now free the rectangle from the list. */ - spin_lock_irqsave(&hardwall_lock, flags); - BUG_ON(!list_empty(&rect->task_head)); - list_del(&rect->list); - spin_unlock_irqrestore(&hardwall_lock, flags); - kfree(rect); + hardwall_remove_proc(info); + + /* Now free the hardwall from the list. */ + spin_lock_irqsave(&hwt->lock, flags); + BUG_ON(!list_empty(&info->task_head)); + list_del(&info->list); + spin_unlock_irqrestore(&hwt->lock, flags); + kfree(info); } static int hardwall_proc_show(struct seq_file *sf, void *v) { - struct hardwall_info *rect = sf->private; + struct hardwall_info *info = sf->private; char buf[256]; - int rc = cpulist_scnprintf(buf, sizeof(buf), &rect->cpumask); + int rc = cpulist_scnprintf(buf, sizeof(buf), &info->cpumask); buf[rc++] = '\n'; seq_write(sf, buf, rc); return 0; @@ -685,7 +922,7 @@ static int hardwall_proc_show(struct seq_file *sf, void *v) static int hardwall_proc_open(struct inode *inode, struct file *file) { - return single_open(file, hardwall_proc_show, PDE(inode)->data); + return single_open(file, hardwall_proc_show, PDE_DATA(inode)); } static const struct file_operations hardwall_proc_fops = { @@ -695,31 +932,45 @@ static const struct file_operations hardwall_proc_fops = { .release = single_release, }; -static void hardwall_add_proc(struct hardwall_info *rect) +static void hardwall_add_proc(struct hardwall_info *info) { char buf[64]; - snprintf(buf, sizeof(buf), "%d", rect->id); - proc_create_data(buf, 0444, hardwall_proc_dir, - &hardwall_proc_fops, rect); + snprintf(buf, sizeof(buf), "%d", info->id); + proc_create_data(buf, 0444, info->type->proc_dir, + &hardwall_proc_fops, info); } -static void hardwall_remove_proc(struct hardwall_info *rect) +static void hardwall_remove_proc(struct hardwall_info *info) { char buf[64]; - snprintf(buf, sizeof(buf), "%d", rect->id); - remove_proc_entry(buf, hardwall_proc_dir); + snprintf(buf, sizeof(buf), "%d", info->id); + remove_proc_entry(buf, info->type->proc_dir); } int proc_pid_hardwall(struct task_struct *task, char *buffer) { - struct hardwall_info *rect = task->thread.hardwall; - return rect ? sprintf(buffer, "%d\n", rect->id) : 0; + int i; + int n = 0; + for (i = 0; i < HARDWALL_TYPES; ++i) { + struct hardwall_info *info = task->thread.hardwall[i].info; + if (info) + n += sprintf(&buffer[n], "%s: %d\n", + info->type->name, info->id); + } + return n; } void proc_tile_hardwall_init(struct proc_dir_entry *root) { - if (!udn_disabled) - hardwall_proc_dir = proc_mkdir("hardwall", root); + int i; + for (i = 0; i < HARDWALL_TYPES; ++i) { + struct hardwall_type *hwt = &hardwall_types[i]; + if (hwt->disabled) + continue; + if (hardwall_proc_dir == NULL) + hardwall_proc_dir = proc_mkdir("hardwall", root); + hwt->proc_dir = proc_mkdir(hwt->name, hardwall_proc_dir); + } } @@ -729,34 +980,45 @@ void proc_tile_hardwall_init(struct proc_dir_entry *root) static long hardwall_ioctl(struct file *file, unsigned int a, unsigned long b) { - struct hardwall_info *rect = file->private_data; + struct hardwall_info *info = file->private_data; + int minor = iminor(file->f_mapping->host); + struct hardwall_type* hwt; if (_IOC_TYPE(a) != HARDWALL_IOCTL_BASE) return -EINVAL; + BUILD_BUG_ON(HARDWALL_TYPES != _HARDWALL_TYPES); + BUILD_BUG_ON(HARDWALL_TYPES != + sizeof(hardwall_types)/sizeof(hardwall_types[0])); + + if (minor < 0 || minor >= HARDWALL_TYPES) + return -EINVAL; + hwt = &hardwall_types[minor]; + WARN_ON(info && hwt != info->type); + switch (_IOC_NR(a)) { case _HARDWALL_CREATE: - if (udn_disabled) + if (hwt->disabled) return -ENOSYS; - if (rect != NULL) + if (info != NULL) return -EALREADY; - rect = hardwall_create(_IOC_SIZE(a), - (const unsigned char __user *)b); - if (IS_ERR(rect)) - return PTR_ERR(rect); - file->private_data = rect; + info = hardwall_create(hwt, _IOC_SIZE(a), + (const unsigned char __user *)b); + if (IS_ERR(info)) + return PTR_ERR(info); + file->private_data = info; return 0; case _HARDWALL_ACTIVATE: - return hardwall_activate(rect); + return hardwall_activate(info); case _HARDWALL_DEACTIVATE: - if (current->thread.hardwall != rect) + if (current->thread.hardwall[hwt->index].info != info) return -EINVAL; - return hardwall_deactivate(current); + return hardwall_deactivate(hwt, current); case _HARDWALL_GET_ID: - return rect ? rect->id : -EINVAL; + return info ? info->id : -EINVAL; default: return -EINVAL; @@ -775,26 +1037,28 @@ static long hardwall_compat_ioctl(struct file *file, /* The user process closed the file; revoke access to user networks. */ static int hardwall_flush(struct file *file, fl_owner_t owner) { - struct hardwall_info *rect = file->private_data; + struct hardwall_info *info = file->private_data; struct task_struct *task, *tmp; unsigned long flags; - if (rect) { + if (info) { /* * NOTE: if multiple threads are activated on this hardwall * file, the other threads will continue having access to the - * UDN until they are context-switched out and back in again. + * user network until they are context-switched out and back + * in again. * * NOTE: A NULL files pointer means the task is being torn * down, so in that case we also deactivate it. */ - spin_lock_irqsave(&hardwall_lock, flags); - list_for_each_entry_safe(task, tmp, &rect->task_head, - thread.hardwall_list) { + struct hardwall_type *hwt = info->type; + spin_lock_irqsave(&hwt->lock, flags); + list_for_each_entry_safe(task, tmp, &info->task_head, + thread.hardwall[hwt->index].list) { if (task->files == owner || task->files == NULL) - _hardwall_deactivate(task); + _hardwall_deactivate(hwt, task); } - spin_unlock_irqrestore(&hardwall_lock, flags); + spin_unlock_irqrestore(&hwt->lock, flags); } return 0; @@ -824,11 +1088,11 @@ static int __init dev_hardwall_init(void) int rc; dev_t dev; - rc = alloc_chrdev_region(&dev, 0, 1, "hardwall"); + rc = alloc_chrdev_region(&dev, 0, HARDWALL_TYPES, "hardwall"); if (rc < 0) return rc; cdev_init(&hardwall_dev, &dev_hardwall_fops); - rc = cdev_add(&hardwall_dev, dev, 1); + rc = cdev_add(&hardwall_dev, dev, HARDWALL_TYPES); if (rc < 0) return rc; diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S index 1a39b7c1c87..8d5b40ff292 100644 --- a/arch/tile/kernel/head_32.S +++ b/arch/tile/kernel/head_32.S @@ -38,13 +38,13 @@ ENTRY(_start) movei r2, TILE_CHIP_REV } { - moveli r0, _HV_VERSION - jal hv_init + moveli r0, _HV_VERSION_OLD_HV_INIT + jal _hv_init } /* Get a reasonable default ASID in r0 */ { move r0, zero - jal hv_inquire_asid + jal _hv_inquire_asid } /* Install the default page table */ { @@ -64,21 +64,21 @@ ENTRY(_start) auli r0, r0, ha16(swapper_pg_dir - PAGE_OFFSET) } { - inv r6 + finv r6 move r1, zero /* high 32 bits of CPA is zero */ } { moveli lr, lo16(1f) - move r5, zero + moveli r5, CTX_PAGE_FLAG } { auli lr, lr, ha16(1f) - j hv_install_context + j _hv_install_context } 1: /* Get our processor number and save it away in SAVE_K_0. */ - jal hv_inquire_topology + jal _hv_inquire_topology mulll_uu r4, r1, r2 /* r1 == y, r2 == width */ add r4, r4, r0 /* r0 == x, so r4 == cpu == y*width + x */ @@ -86,7 +86,7 @@ ENTRY(_start) /* * Load up our per-cpu offset. When the first (master) tile * boots, this value is still zero, so we will load boot_pc - * with start_kernel, and boot_sp with init_stack + THREAD_SIZE. + * with start_kernel, and boot_sp at the top of init_stack. * The master tile initializes the per-cpu offset array, so that * when subsequent (secondary) tiles boot, they will instead load * from their per-cpu versions of boot_sp and boot_pc. @@ -126,7 +126,6 @@ ENTRY(_start) lw sp, r1 or r4, sp, r4 mtspr SPR_SYSTEM_SAVE_K_0, r4 /* save ksp0 + cpu */ - addi sp, sp, -STACK_TOP_DELTA { move lr, zero /* stop backtraces in the called function */ jr r0 @@ -141,11 +140,11 @@ ENTRY(empty_zero_page) .macro PTE va, cpa, bits1, no_org=0 .ifeq \no_org - .org swapper_pg_dir + HV_L1_INDEX(\va) * HV_PTE_SIZE + .org swapper_pg_dir + PGD_INDEX(\va) * HV_PTE_SIZE .endif .word HV_PTE_PAGE | HV_PTE_DIRTY | HV_PTE_PRESENT | HV_PTE_ACCESSED | \ (HV_PTE_MODE_CACHE_NO_L3 << HV_PTE_INDEX_MODE) - .word (\bits1) | (HV_CPA_TO_PFN(\cpa) << (HV_PTE_INDEX_PFN - 32)) + .word (\bits1) | (HV_CPA_TO_PTFN(\cpa) << (HV_PTE_INDEX_PTFN - 32)) .endm __PAGE_ALIGNED_DATA @@ -163,10 +162,10 @@ ENTRY(swapper_pg_dir) .set addr, addr + PGDIR_SIZE .endr - /* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */ - PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \ + /* The true text VAs are mapped as VA = PA + MEM_SV_START */ + PTE MEM_SV_START, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \ (1 << (HV_PTE_INDEX_EXECUTABLE - 32)) - .org swapper_pg_dir + HV_L1_SIZE + .org swapper_pg_dir + PGDIR_SIZE END(swapper_pg_dir) /* diff --git a/arch/tile/kernel/head_64.S b/arch/tile/kernel/head_64.S index 6bc3a932fe4..bd0e12f283f 100644 --- a/arch/tile/kernel/head_64.S +++ b/arch/tile/kernel/head_64.S @@ -25,6 +25,15 @@ #include <arch/chip.h> #include <arch/spr_def.h> +/* Extract two 32-bit bit values that were read into one register. */ +#ifdef __BIG_ENDIAN__ +#define GET_FIRST_INT(rd, rs) shrsi rd, rs, 32 +#define GET_SECOND_INT(rd, rs) addxi rd, rs, 0 +#else +#define GET_FIRST_INT(rd, rs) addxi rd, rs, 0 +#define GET_SECOND_INT(rd, rs) shrsi rd, rs, 32 +#endif + /* * This module contains the entry code for kernel images. It performs the * minimal setup needed to call the generic C routines. @@ -34,17 +43,23 @@ ENTRY(_start) /* Notify the hypervisor of what version of the API we want */ { +#if KERNEL_PL == 1 && _HV_VERSION == 13 + /* Support older hypervisors by asking for API version 12. */ + movei r0, _HV_VERSION_OLD_HV_INIT +#else + movei r0, _HV_VERSION +#endif movei r1, TILE_CHIP - movei r2, TILE_CHIP_REV } { - moveli r0, _HV_VERSION - jal hv_init + movei r2, TILE_CHIP_REV + movei r3, KERNEL_PL } + jal _hv_init /* Get a reasonable default ASID in r0 */ { move r0, zero - jal hv_inquire_asid + jal _hv_inquire_asid } /* @@ -55,7 +70,7 @@ ENTRY(_start) * other CPUs should see a properly-constructed page table. */ { - v4int_l r2, zero, r0 /* ASID for hv_install_context */ + GET_FIRST_INT(r2, r0) /* ASID for hv_install_context */ moveli r4, hw1_last(swapper_pgprot - PAGE_OFFSET) } { @@ -71,7 +86,7 @@ ENTRY(_start) { /* After initializing swapper_pgprot, HV_PTE_GLOBAL is set. */ bfextu r7, r1, HV_PTE_INDEX_GLOBAL, HV_PTE_INDEX_GLOBAL - inv r4 + finv r4 } bnez r7, .Lno_write { @@ -114,30 +129,25 @@ ENTRY(_start) shl16insli r0, r0, hw0(swapper_pg_dir - PAGE_OFFSET) } { - move r3, zero - j hv_install_context + moveli r3, CTX_PAGE_FLAG + j _hv_install_context } 1: /* Install the interrupt base. */ - moveli r0, hw2_last(MEM_SV_START) - shl16insli r0, r0, hw1(MEM_SV_START) - shl16insli r0, r0, hw0(MEM_SV_START) + moveli r0, hw2_last(intrpt_start) + shl16insli r0, r0, hw1(intrpt_start) + shl16insli r0, r0, hw0(intrpt_start) mtspr SPR_INTERRUPT_VECTOR_BASE_K, r0 - /* - * Get our processor number and save it away in SAVE_K_0. - * Extract stuff from the topology structure: r4 = y, r6 = x, - * r5 = width. FIXME: consider whether we want to just make these - * 64-bit values (and if so fix smp_topology write below, too). - */ - jal hv_inquire_topology + /* Get our processor number and save it away in SAVE_K_0. */ + jal _hv_inquire_topology { - v4int_l r5, zero, r1 /* r5 = width */ - shrui r4, r0, 32 /* r4 = y */ + GET_FIRST_INT(r5, r1) /* r5 = width */ + GET_SECOND_INT(r4, r0) /* r4 = y */ } { - v4int_l r6, zero, r0 /* r6 = x */ + GET_FIRST_INT(r6, r0) /* r6 = x */ mul_lu_lu r4, r4, r5 } { @@ -148,7 +158,7 @@ ENTRY(_start) /* * Load up our per-cpu offset. When the first (master) tile * boots, this value is still zero, so we will load boot_pc - * with start_kernel, and boot_sp with init_stack + THREAD_SIZE. + * with start_kernel, and boot_sp with at the top of init_stack. * The master tile initializes the per-cpu offset array, so that * when subsequent (secondary) tiles boot, they will instead load * from their per-cpu versions of boot_sp and boot_pc. @@ -192,9 +202,9 @@ ENTRY(_start) } ld r0, r0 ld sp, r1 - or r4, sp, r4 + shli r4, r4, CPU_SHIFT + bfins r4, sp, 0, CPU_SHIFT-1 mtspr SPR_SYSTEM_SAVE_K_0, r4 /* save ksp0 + cpu */ - addi sp, sp, -STACK_TOP_DELTA { move lr, zero /* stop backtraces in the called function */ jr r0 @@ -210,19 +220,19 @@ ENTRY(empty_zero_page) .macro PTE cpa, bits1 .quad HV_PTE_PAGE | HV_PTE_DIRTY | HV_PTE_PRESENT | HV_PTE_ACCESSED |\ HV_PTE_GLOBAL | (HV_PTE_MODE_CACHE_NO_L3 << HV_PTE_INDEX_MODE) |\ - (\bits1) | (HV_CPA_TO_PFN(\cpa) << HV_PTE_INDEX_PFN) + (\bits1) | (HV_CPA_TO_PTFN(\cpa) << HV_PTE_INDEX_PTFN) .endm __PAGE_ALIGNED_DATA .align PAGE_SIZE ENTRY(swapper_pg_dir) - .org swapper_pg_dir + HV_L0_INDEX(PAGE_OFFSET) * HV_PTE_SIZE + .org swapper_pg_dir + PGD_INDEX(PAGE_OFFSET) * HV_PTE_SIZE .Lsv_data_pmd: .quad 0 /* PTE temp_data_pmd - PAGE_OFFSET, 0 */ - .org swapper_pg_dir + HV_L0_INDEX(MEM_SV_START) * HV_PTE_SIZE + .org swapper_pg_dir + PGD_INDEX(MEM_SV_START) * HV_PTE_SIZE .Lsv_code_pmd: .quad 0 /* PTE temp_code_pmd - PAGE_OFFSET, 0 */ - .org swapper_pg_dir + HV_L0_SIZE + .org swapper_pg_dir + SIZEOF_PGD END(swapper_pg_dir) .align HV_PAGE_TABLE_ALIGN @@ -233,11 +243,11 @@ ENTRY(temp_data_pmd) * permissions later. */ .set addr, 0 - .rept HV_L1_ENTRIES + .rept PTRS_PER_PMD PTE addr, HV_PTE_READABLE | HV_PTE_WRITABLE - .set addr, addr + HV_PAGE_SIZE_LARGE + .set addr, addr + HPAGE_SIZE .endr - .org temp_data_pmd + HV_L1_SIZE + .org temp_data_pmd + SIZEOF_PMD END(temp_data_pmd) .align HV_PAGE_TABLE_ALIGN @@ -248,11 +258,11 @@ ENTRY(temp_code_pmd) * permissions later. */ .set addr, 0 - .rept HV_L1_ENTRIES + .rept PTRS_PER_PMD PTE addr, HV_PTE_READABLE | HV_PTE_EXECUTABLE - .set addr, addr + HV_PAGE_SIZE_LARGE + .set addr, addr + HPAGE_SIZE .endr - .org temp_code_pmd + HV_L1_SIZE + .org temp_code_pmd + SIZEOF_PMD END(temp_code_pmd) /* diff --git a/arch/tile/kernel/hvglue.S b/arch/tile/kernel/hvglue.S new file mode 100644 index 00000000000..2ab45662239 --- /dev/null +++ b/arch/tile/kernel/hvglue.S @@ -0,0 +1,74 @@ +/* Hypervisor call vector addresses; see <hv/hypervisor.h> */ +.macro gensym sym, val, size +.org \val +.global _\sym +.type _\sym,function +_\sym: +.size _\sym,\size +#ifndef CONFIG_TILE_HVGLUE_TRACE +.globl \sym +.set \sym,_\sym +#endif +.endm + +.section .hvglue,"x",@nobits +.align 8 +gensym hv_init, 0x20, 32 +gensym hv_install_context, 0x40, 32 +gensym hv_sysconf, 0x60, 32 +gensym hv_get_rtc, 0x80, 32 +gensym hv_set_rtc, 0xa0, 32 +gensym hv_flush_asid, 0xc0, 32 +gensym hv_flush_page, 0xe0, 32 +gensym hv_flush_pages, 0x100, 32 +gensym hv_restart, 0x120, 32 +gensym hv_halt, 0x140, 32 +gensym hv_power_off, 0x160, 32 +gensym hv_inquire_physical, 0x180, 32 +gensym hv_inquire_memory_controller, 0x1a0, 32 +gensym hv_inquire_virtual, 0x1c0, 32 +gensym hv_inquire_asid, 0x1e0, 32 +gensym hv_nanosleep, 0x200, 32 +gensym hv_console_read_if_ready, 0x220, 32 +gensym hv_console_write, 0x240, 32 +gensym hv_downcall_dispatch, 0x260, 32 +gensym hv_inquire_topology, 0x280, 32 +gensym hv_fs_findfile, 0x2a0, 32 +gensym hv_fs_fstat, 0x2c0, 32 +gensym hv_fs_pread, 0x2e0, 32 +gensym hv_physaddr_read64, 0x300, 32 +gensym hv_physaddr_write64, 0x320, 32 +gensym hv_get_command_line, 0x340, 32 +gensym hv_set_caching, 0x360, 32 +gensym hv_bzero_page, 0x380, 32 +gensym hv_register_message_state, 0x3a0, 32 +gensym hv_send_message, 0x3c0, 32 +gensym hv_receive_message, 0x3e0, 32 +gensym hv_inquire_context, 0x400, 32 +gensym hv_start_all_tiles, 0x420, 32 +gensym hv_dev_open, 0x440, 32 +gensym hv_dev_close, 0x460, 32 +gensym hv_dev_pread, 0x480, 32 +gensym hv_dev_pwrite, 0x4a0, 32 +gensym hv_dev_poll, 0x4c0, 32 +gensym hv_dev_poll_cancel, 0x4e0, 32 +gensym hv_dev_preada, 0x500, 32 +gensym hv_dev_pwritea, 0x520, 32 +gensym hv_flush_remote, 0x540, 32 +gensym hv_console_putc, 0x560, 32 +gensym hv_inquire_tiles, 0x580, 32 +gensym hv_confstr, 0x5a0, 32 +gensym hv_reexec, 0x5c0, 32 +gensym hv_set_command_line, 0x5e0, 32 +gensym hv_clear_intr, 0x600, 32 +gensym hv_enable_intr, 0x620, 32 +gensym hv_disable_intr, 0x640, 32 +gensym hv_raise_intr, 0x660, 32 +gensym hv_trigger_ipi, 0x680, 32 +gensym hv_store_mapping, 0x6a0, 32 +gensym hv_inquire_realpa, 0x6c0, 32 +gensym hv_flush_all, 0x6e0, 32 +gensym hv_get_ipi_pte, 0x700, 32 +gensym hv_set_pte_super_shift, 0x720, 32 +gensym hv_console_set_ipi, 0x7e0, 32 +gensym hv_glue_internals, 0x800, 30720 diff --git a/arch/tile/kernel/hvglue.lds b/arch/tile/kernel/hvglue.lds deleted file mode 100644 index 2b7cd0a659a..00000000000 --- a/arch/tile/kernel/hvglue.lds +++ /dev/null @@ -1,58 +0,0 @@ -/* Hypervisor call vector addresses; see <hv/hypervisor.h> */ -hv_init = TEXT_OFFSET + 0x10020; -hv_install_context = TEXT_OFFSET + 0x10040; -hv_sysconf = TEXT_OFFSET + 0x10060; -hv_get_rtc = TEXT_OFFSET + 0x10080; -hv_set_rtc = TEXT_OFFSET + 0x100a0; -hv_flush_asid = TEXT_OFFSET + 0x100c0; -hv_flush_page = TEXT_OFFSET + 0x100e0; -hv_flush_pages = TEXT_OFFSET + 0x10100; -hv_restart = TEXT_OFFSET + 0x10120; -hv_halt = TEXT_OFFSET + 0x10140; -hv_power_off = TEXT_OFFSET + 0x10160; -hv_inquire_physical = TEXT_OFFSET + 0x10180; -hv_inquire_memory_controller = TEXT_OFFSET + 0x101a0; -hv_inquire_virtual = TEXT_OFFSET + 0x101c0; -hv_inquire_asid = TEXT_OFFSET + 0x101e0; -hv_nanosleep = TEXT_OFFSET + 0x10200; -hv_console_read_if_ready = TEXT_OFFSET + 0x10220; -hv_console_write = TEXT_OFFSET + 0x10240; -hv_downcall_dispatch = TEXT_OFFSET + 0x10260; -hv_inquire_topology = TEXT_OFFSET + 0x10280; -hv_fs_findfile = TEXT_OFFSET + 0x102a0; -hv_fs_fstat = TEXT_OFFSET + 0x102c0; -hv_fs_pread = TEXT_OFFSET + 0x102e0; -hv_physaddr_read64 = TEXT_OFFSET + 0x10300; -hv_physaddr_write64 = TEXT_OFFSET + 0x10320; -hv_get_command_line = TEXT_OFFSET + 0x10340; -hv_set_caching = TEXT_OFFSET + 0x10360; -hv_bzero_page = TEXT_OFFSET + 0x10380; -hv_register_message_state = TEXT_OFFSET + 0x103a0; -hv_send_message = TEXT_OFFSET + 0x103c0; -hv_receive_message = TEXT_OFFSET + 0x103e0; -hv_inquire_context = TEXT_OFFSET + 0x10400; -hv_start_all_tiles = TEXT_OFFSET + 0x10420; -hv_dev_open = TEXT_OFFSET + 0x10440; -hv_dev_close = TEXT_OFFSET + 0x10460; -hv_dev_pread = TEXT_OFFSET + 0x10480; -hv_dev_pwrite = TEXT_OFFSET + 0x104a0; -hv_dev_poll = TEXT_OFFSET + 0x104c0; -hv_dev_poll_cancel = TEXT_OFFSET + 0x104e0; -hv_dev_preada = TEXT_OFFSET + 0x10500; -hv_dev_pwritea = TEXT_OFFSET + 0x10520; -hv_flush_remote = TEXT_OFFSET + 0x10540; -hv_console_putc = TEXT_OFFSET + 0x10560; -hv_inquire_tiles = TEXT_OFFSET + 0x10580; -hv_confstr = TEXT_OFFSET + 0x105a0; -hv_reexec = TEXT_OFFSET + 0x105c0; -hv_set_command_line = TEXT_OFFSET + 0x105e0; -hv_clear_intr = TEXT_OFFSET + 0x10600; -hv_enable_intr = TEXT_OFFSET + 0x10620; -hv_disable_intr = TEXT_OFFSET + 0x10640; -hv_raise_intr = TEXT_OFFSET + 0x10660; -hv_trigger_ipi = TEXT_OFFSET + 0x10680; -hv_store_mapping = TEXT_OFFSET + 0x106a0; -hv_inquire_realpa = TEXT_OFFSET + 0x106c0; -hv_flush_all = TEXT_OFFSET + 0x106e0; -hv_get_ipi_pte = TEXT_OFFSET + 0x10700; -hv_glue_internals = TEXT_OFFSET + 0x10720; diff --git a/arch/tile/kernel/hvglue_trace.c b/arch/tile/kernel/hvglue_trace.c new file mode 100644 index 00000000000..85c74ad2931 --- /dev/null +++ b/arch/tile/kernel/hvglue_trace.c @@ -0,0 +1,266 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +/* + * Pull in the hypervisor header so we declare all the ABI functions + * with the underscore versions, then undef the names so that we can + * provide our own wrapper versions. + */ +#define hv_init _hv_init +#define hv_install_context _hv_install_context +#define hv_sysconf _hv_sysconf +#define hv_get_rtc _hv_get_rtc +#define hv_set_rtc _hv_set_rtc +#define hv_flush_asid _hv_flush_asid +#define hv_flush_page _hv_flush_page +#define hv_flush_pages _hv_flush_pages +#define hv_restart _hv_restart +#define hv_halt _hv_halt +#define hv_power_off _hv_power_off +#define hv_inquire_physical _hv_inquire_physical +#define hv_inquire_memory_controller _hv_inquire_memory_controller +#define hv_inquire_virtual _hv_inquire_virtual +#define hv_inquire_asid _hv_inquire_asid +#define hv_nanosleep _hv_nanosleep +#define hv_console_read_if_ready _hv_console_read_if_ready +#define hv_console_write _hv_console_write +#define hv_downcall_dispatch _hv_downcall_dispatch +#define hv_inquire_topology _hv_inquire_topology +#define hv_fs_findfile _hv_fs_findfile +#define hv_fs_fstat _hv_fs_fstat +#define hv_fs_pread _hv_fs_pread +#define hv_physaddr_read64 _hv_physaddr_read64 +#define hv_physaddr_write64 _hv_physaddr_write64 +#define hv_get_command_line _hv_get_command_line +#define hv_set_caching _hv_set_caching +#define hv_bzero_page _hv_bzero_page +#define hv_register_message_state _hv_register_message_state +#define hv_send_message _hv_send_message +#define hv_receive_message _hv_receive_message +#define hv_inquire_context _hv_inquire_context +#define hv_start_all_tiles _hv_start_all_tiles +#define hv_dev_open _hv_dev_open +#define hv_dev_close _hv_dev_close +#define hv_dev_pread _hv_dev_pread +#define hv_dev_pwrite _hv_dev_pwrite +#define hv_dev_poll _hv_dev_poll +#define hv_dev_poll_cancel _hv_dev_poll_cancel +#define hv_dev_preada _hv_dev_preada +#define hv_dev_pwritea _hv_dev_pwritea +#define hv_flush_remote _hv_flush_remote +#define hv_console_putc _hv_console_putc +#define hv_inquire_tiles _hv_inquire_tiles +#define hv_confstr _hv_confstr +#define hv_reexec _hv_reexec +#define hv_set_command_line _hv_set_command_line +#define hv_clear_intr _hv_clear_intr +#define hv_enable_intr _hv_enable_intr +#define hv_disable_intr _hv_disable_intr +#define hv_raise_intr _hv_raise_intr +#define hv_trigger_ipi _hv_trigger_ipi +#define hv_store_mapping _hv_store_mapping +#define hv_inquire_realpa _hv_inquire_realpa +#define hv_flush_all _hv_flush_all +#define hv_get_ipi_pte _hv_get_ipi_pte +#define hv_set_pte_super_shift _hv_set_pte_super_shift +#define hv_console_set_ipi _hv_console_set_ipi +#include <hv/hypervisor.h> +#undef hv_init +#undef hv_install_context +#undef hv_sysconf +#undef hv_get_rtc +#undef hv_set_rtc +#undef hv_flush_asid +#undef hv_flush_page +#undef hv_flush_pages +#undef hv_restart +#undef hv_halt +#undef hv_power_off +#undef hv_inquire_physical +#undef hv_inquire_memory_controller +#undef hv_inquire_virtual +#undef hv_inquire_asid +#undef hv_nanosleep +#undef hv_console_read_if_ready +#undef hv_console_write +#undef hv_downcall_dispatch +#undef hv_inquire_topology +#undef hv_fs_findfile +#undef hv_fs_fstat +#undef hv_fs_pread +#undef hv_physaddr_read64 +#undef hv_physaddr_write64 +#undef hv_get_command_line +#undef hv_set_caching +#undef hv_bzero_page +#undef hv_register_message_state +#undef hv_send_message +#undef hv_receive_message +#undef hv_inquire_context +#undef hv_start_all_tiles +#undef hv_dev_open +#undef hv_dev_close +#undef hv_dev_pread +#undef hv_dev_pwrite +#undef hv_dev_poll +#undef hv_dev_poll_cancel +#undef hv_dev_preada +#undef hv_dev_pwritea +#undef hv_flush_remote +#undef hv_console_putc +#undef hv_inquire_tiles +#undef hv_confstr +#undef hv_reexec +#undef hv_set_command_line +#undef hv_clear_intr +#undef hv_enable_intr +#undef hv_disable_intr +#undef hv_raise_intr +#undef hv_trigger_ipi +#undef hv_store_mapping +#undef hv_inquire_realpa +#undef hv_flush_all +#undef hv_get_ipi_pte +#undef hv_set_pte_super_shift +#undef hv_console_set_ipi + +/* + * Provide macros based on <linux/syscalls.h> to provide a wrapper + * function that invokes the same function with an underscore prefix. + * We can't use the existing __SC_xxx macros because we need to + * support up to nine arguments rather than up to six, and also this + * way the file stands alone from possible changes in the + * implementation of <linux/syscalls.h>. + */ +#define HV_WRAP0(type, name) \ + type name(void); \ + type name(void) \ + { \ + return _##name(); \ + } +#define __HV_DECL1(t1, a1) t1 a1 +#define __HV_DECL2(t2, a2, ...) t2 a2, __HV_DECL1(__VA_ARGS__) +#define __HV_DECL3(t3, a3, ...) t3 a3, __HV_DECL2(__VA_ARGS__) +#define __HV_DECL4(t4, a4, ...) t4 a4, __HV_DECL3(__VA_ARGS__) +#define __HV_DECL5(t5, a5, ...) t5 a5, __HV_DECL4(__VA_ARGS__) +#define __HV_DECL6(t6, a6, ...) t6 a6, __HV_DECL5(__VA_ARGS__) +#define __HV_DECL7(t7, a7, ...) t7 a7, __HV_DECL6(__VA_ARGS__) +#define __HV_DECL8(t8, a8, ...) t8 a8, __HV_DECL7(__VA_ARGS__) +#define __HV_DECL9(t9, a9, ...) t9 a9, __HV_DECL8(__VA_ARGS__) +#define __HV_PASS1(t1, a1) a1 +#define __HV_PASS2(t2, a2, ...) a2, __HV_PASS1(__VA_ARGS__) +#define __HV_PASS3(t3, a3, ...) a3, __HV_PASS2(__VA_ARGS__) +#define __HV_PASS4(t4, a4, ...) a4, __HV_PASS3(__VA_ARGS__) +#define __HV_PASS5(t5, a5, ...) a5, __HV_PASS4(__VA_ARGS__) +#define __HV_PASS6(t6, a6, ...) a6, __HV_PASS5(__VA_ARGS__) +#define __HV_PASS7(t7, a7, ...) a7, __HV_PASS6(__VA_ARGS__) +#define __HV_PASS8(t8, a8, ...) a8, __HV_PASS7(__VA_ARGS__) +#define __HV_PASS9(t9, a9, ...) a9, __HV_PASS8(__VA_ARGS__) +#define HV_WRAPx(x, type, name, ...) \ + type name(__HV_DECL##x(__VA_ARGS__)); \ + type name(__HV_DECL##x(__VA_ARGS__)) \ + { \ + return _##name(__HV_PASS##x(__VA_ARGS__)); \ + } +#define HV_WRAP1(type, name, ...) HV_WRAPx(1, type, name, __VA_ARGS__) +#define HV_WRAP2(type, name, ...) HV_WRAPx(2, type, name, __VA_ARGS__) +#define HV_WRAP3(type, name, ...) HV_WRAPx(3, type, name, __VA_ARGS__) +#define HV_WRAP4(type, name, ...) HV_WRAPx(4, type, name, __VA_ARGS__) +#define HV_WRAP5(type, name, ...) HV_WRAPx(5, type, name, __VA_ARGS__) +#define HV_WRAP6(type, name, ...) HV_WRAPx(6, type, name, __VA_ARGS__) +#define HV_WRAP7(type, name, ...) HV_WRAPx(7, type, name, __VA_ARGS__) +#define HV_WRAP8(type, name, ...) HV_WRAPx(8, type, name, __VA_ARGS__) +#define HV_WRAP9(type, name, ...) HV_WRAPx(9, type, name, __VA_ARGS__) + +/* List all the hypervisor API functions. */ +HV_WRAP4(void, hv_init, HV_VersionNumber, interface_version_number, + int, chip_num, int, chip_rev_num, int, client_pl) +HV_WRAP1(long, hv_sysconf, HV_SysconfQuery, query) +HV_WRAP3(int, hv_confstr, HV_ConfstrQuery, query, HV_VirtAddr, buf, int, len) +#if CHIP_HAS_IPI() +HV_WRAP3(int, hv_get_ipi_pte, HV_Coord, tile, int, pl, HV_PTE*, pte) +HV_WRAP3(int, hv_console_set_ipi, int, ipi, int, event, HV_Coord, coord); +#else +HV_WRAP1(void, hv_enable_intr, HV_IntrMask, enab_mask) +HV_WRAP1(void, hv_disable_intr, HV_IntrMask, disab_mask) +HV_WRAP1(void, hv_clear_intr, HV_IntrMask, clear_mask) +HV_WRAP1(void, hv_raise_intr, HV_IntrMask, raise_mask) +HV_WRAP2(HV_Errno, hv_trigger_ipi, HV_Coord, tile, int, interrupt) +#endif /* !CHIP_HAS_IPI() */ +HV_WRAP3(int, hv_store_mapping, HV_VirtAddr, va, unsigned int, len, + HV_PhysAddr, pa) +HV_WRAP2(HV_PhysAddr, hv_inquire_realpa, HV_PhysAddr, cpa, unsigned int, len) +HV_WRAP0(HV_RTCTime, hv_get_rtc) +HV_WRAP1(void, hv_set_rtc, HV_RTCTime, time) +HV_WRAP4(int, hv_install_context, HV_PhysAddr, page_table, HV_PTE, access, + HV_ASID, asid, __hv32, flags) +HV_WRAP2(int, hv_set_pte_super_shift, int, level, int, log2_count) +HV_WRAP0(HV_Context, hv_inquire_context) +HV_WRAP1(int, hv_flush_asid, HV_ASID, asid) +HV_WRAP2(int, hv_flush_page, HV_VirtAddr, address, HV_PageSize, page_size) +HV_WRAP3(int, hv_flush_pages, HV_VirtAddr, start, HV_PageSize, page_size, + unsigned long, size) +HV_WRAP1(int, hv_flush_all, int, preserve_global) +HV_WRAP2(void, hv_restart, HV_VirtAddr, cmd, HV_VirtAddr, args) +HV_WRAP0(void, hv_halt) +HV_WRAP0(void, hv_power_off) +HV_WRAP1(int, hv_reexec, HV_PhysAddr, entry) +HV_WRAP0(HV_Topology, hv_inquire_topology) +HV_WRAP3(HV_Errno, hv_inquire_tiles, HV_InqTileSet, set, HV_VirtAddr, cpumask, + int, length) +HV_WRAP1(HV_PhysAddrRange, hv_inquire_physical, int, idx) +HV_WRAP2(HV_MemoryControllerInfo, hv_inquire_memory_controller, HV_Coord, coord, + int, controller) +HV_WRAP1(HV_VirtAddrRange, hv_inquire_virtual, int, idx) +HV_WRAP1(HV_ASIDRange, hv_inquire_asid, int, idx) +HV_WRAP1(void, hv_nanosleep, int, nanosecs) +HV_WRAP0(int, hv_console_read_if_ready) +HV_WRAP1(void, hv_console_putc, int, byte) +HV_WRAP2(int, hv_console_write, HV_VirtAddr, bytes, int, len) +HV_WRAP0(void, hv_downcall_dispatch) +HV_WRAP1(int, hv_fs_findfile, HV_VirtAddr, filename) +HV_WRAP1(HV_FS_StatInfo, hv_fs_fstat, int, inode) +HV_WRAP4(int, hv_fs_pread, int, inode, HV_VirtAddr, buf, + int, length, int, offset) +HV_WRAP2(unsigned long long, hv_physaddr_read64, HV_PhysAddr, addr, + HV_PTE, access) +HV_WRAP3(void, hv_physaddr_write64, HV_PhysAddr, addr, HV_PTE, access, + unsigned long long, val) +HV_WRAP2(int, hv_get_command_line, HV_VirtAddr, buf, int, length) +HV_WRAP2(HV_Errno, hv_set_command_line, HV_VirtAddr, buf, int, length) +HV_WRAP1(void, hv_set_caching, unsigned long, bitmask) +HV_WRAP2(void, hv_bzero_page, HV_VirtAddr, va, unsigned int, size) +HV_WRAP1(HV_Errno, hv_register_message_state, HV_MsgState*, msgstate) +HV_WRAP4(int, hv_send_message, HV_Recipient *, recips, int, nrecip, + HV_VirtAddr, buf, int, buflen) +HV_WRAP3(HV_RcvMsgInfo, hv_receive_message, HV_MsgState, msgstate, + HV_VirtAddr, buf, int, buflen) +HV_WRAP0(void, hv_start_all_tiles) +HV_WRAP2(int, hv_dev_open, HV_VirtAddr, name, __hv32, flags) +HV_WRAP1(int, hv_dev_close, int, devhdl) +HV_WRAP5(int, hv_dev_pread, int, devhdl, __hv32, flags, HV_VirtAddr, va, + __hv32, len, __hv64, offset) +HV_WRAP5(int, hv_dev_pwrite, int, devhdl, __hv32, flags, HV_VirtAddr, va, + __hv32, len, __hv64, offset) +HV_WRAP3(int, hv_dev_poll, int, devhdl, __hv32, events, HV_IntArg, intarg) +HV_WRAP1(int, hv_dev_poll_cancel, int, devhdl) +HV_WRAP6(int, hv_dev_preada, int, devhdl, __hv32, flags, __hv32, sgl_len, + HV_SGL *, sglp, __hv64, offset, HV_IntArg, intarg) +HV_WRAP6(int, hv_dev_pwritea, int, devhdl, __hv32, flags, __hv32, sgl_len, + HV_SGL *, sglp, __hv64, offset, HV_IntArg, intarg) +HV_WRAP9(int, hv_flush_remote, HV_PhysAddr, cache_pa, + unsigned long, cache_control, unsigned long*, cache_cpumask, + HV_VirtAddr, tlb_va, unsigned long, tlb_length, + unsigned long, tlb_pgsize, unsigned long*, tlb_cpumask, + HV_Remote_ASID*, asids, int, asidcount) diff --git a/arch/tile/kernel/init_task.c b/arch/tile/kernel/init_task.c deleted file mode 100644 index 928b3187066..00000000000 --- a/arch/tile/kernel/init_task.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/init_task.h> -#include <linux/mqueue.h> -#include <linux/module.h> -#include <linux/start_kernel.h> -#include <linux/uaccess.h> - -static struct signal_struct init_signals = INIT_SIGNALS(init_signals); -static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); - -/* - * Initial thread structure. - * - * We need to make sure that this is THREAD_SIZE aligned due to the - * way process stacks are handled. This is done by having a special - * "init_task" linker map entry.. - */ -union thread_union init_thread_union __init_task_data = { - INIT_THREAD_INFO(init_task) -}; - -/* - * Initial task structure. - * - * All other task structs will be allocated on slabs in fork.c - */ -struct task_struct init_task = INIT_TASK(init_task); -EXPORT_SYMBOL(init_task); - -/* - * per-CPU stack and boot info. - */ -DEFINE_PER_CPU(unsigned long, boot_sp) = - (unsigned long)init_stack + THREAD_SIZE; - -#ifdef CONFIG_SMP -DEFINE_PER_CPU(unsigned long, boot_pc) = (unsigned long)start_kernel; -#else -/* - * The variable must be __initdata since it references __init code. - * With CONFIG_SMP it is per-cpu data, which is exempt from validation. - */ -unsigned long __initdata boot_pc = (unsigned long)start_kernel; -#endif diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S index aecc8ed5f39..cdbda45a4e4 100644 --- a/arch/tile/kernel/intvec_32.S +++ b/arch/tile/kernel/intvec_32.S @@ -28,20 +28,10 @@ #include <arch/interrupts.h> #include <arch/spr_def.h> -#ifdef CONFIG_PREEMPT -# error "No support for kernel preemption currently" -#endif - #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg) #define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR) -#if !CHIP_HAS_WH64() - /* By making this an empty macro, we can use wh64 in the code. */ - .macro wh64 reg - .endm -#endif - .macro push_reg reg, ptr=sp, delta=-4 { sw \ptr, \reg @@ -189,7 +179,7 @@ intvec_\vecname: * point sp at the top aligned address on the actual stack page. */ mfspr r0, SPR_SYSTEM_SAVE_K_0 - mm r0, r0, zero, LOG2_THREAD_SIZE, 31 + mm r0, r0, zero, LOG2_NR_CPU_IDS, 31 0: /* @@ -207,6 +197,9 @@ intvec_\vecname: * cache line 1: r14...r29 * cache line 0: 2 x frame, r0..r13 */ +#if STACK_TOP_DELTA != 64 +#error STACK_TOP_DELTA must be 64 for assumptions here and in task_pt_regs() +#endif andi r0, r0, -64 /* @@ -320,24 +313,20 @@ intvec_\vecname: movei r3, 0 } .else - .ifc \c_routine, op_handle_perf_interrupt + .ifc \c_routine, handle_perf_interrupt { mfspr r2, PERF_COUNT_STS movei r3, -1 /* not used, but set for consistency */ } .else -#if CHIP_HAS_AUX_PERF_COUNTERS() - .ifc \c_routine, op_handle_aux_perf_interrupt + .ifc \c_routine, handle_perf_interrupt { mfspr r2, AUX_PERF_COUNT_STS movei r3, -1 /* not used, but set for consistency */ } .else -#endif movei r3, 0 -#if CHIP_HAS_AUX_PERF_COUNTERS() .endif -#endif .endif .endif .endif @@ -354,7 +343,7 @@ intvec_\vecname: #ifdef __COLLECT_LINKER_FEEDBACK__ .pushsection .text.intvec_feedback,"ax" .org (\vecnum << 5) - FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8) + FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8) jrp lr .popsection #endif @@ -468,7 +457,7 @@ intvec_\vecname: } { auli r21, r21, ha16(__per_cpu_offset) - mm r20, r20, zero, 0, LOG2_THREAD_SIZE-1 + mm r20, r20, zero, 0, LOG2_NR_CPU_IDS-1 } s2a r20, r20, r21 lw tp, r20 @@ -562,7 +551,6 @@ intvec_\vecname: .endif mtspr INTERRUPT_CRITICAL_SECTION, zero -#if CHIP_HAS_WH64() /* * Prepare the first 256 stack bytes to be rapidly accessible * without having to fetch the background data. We don't really @@ -583,7 +571,6 @@ intvec_\vecname: addi r52, r52, -64 } wh64 r52 -#endif #ifdef CONFIG_TRACE_IRQFLAGS .ifnc \function,handle_nmi @@ -762,7 +749,7 @@ intvec_\vecname: .macro dc_dispatch vecnum, vecname .org (\vecnum << 8) intvec_\vecname: - j hv_downcall_dispatch + j _hv_downcall_dispatch ENDPROC(intvec_\vecname) .endm @@ -799,6 +786,10 @@ handle_interrupt: * This routine takes a boolean in r30 indicating if this is an NMI. * If so, we also expect a boolean in r31 indicating whether to * re-enable the oprofile interrupts. + * + * Note that .Lresume_userspace is jumped to directly in several + * places, and we need to make sure r30 is set correctly in those + * callers as well. */ STD_ENTRY(interrupt_return) /* If we're resuming to kernel space, don't check thread flags. */ @@ -808,17 +799,37 @@ STD_ENTRY(interrupt_return) } lw r29, r29 andi r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */ + bzt r29, .Lresume_userspace + +#ifdef CONFIG_PREEMPT + /* Returning to kernel space. Check if we need preemption. */ + GET_THREAD_INFO(r29) + addli r28, r29, THREAD_INFO_FLAGS_OFFSET { - bzt r29, .Lresume_userspace - PTREGS_PTR(r29, PTREGS_OFFSET_PC) + lw r28, r28 + addli r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET } + { + andi r28, r28, _TIF_NEED_RESCHED + lw r29, r29 + } + bzt r28, 1f + bnz r29, 1f + /* Disable interrupts explicitly for preemption. */ + IRQ_DISABLE(r20,r21) + TRACE_IRQS_OFF + jal preempt_schedule_irq + FEEDBACK_REENTER(interrupt_return) +1: +#endif /* If we're resuming to _cpu_idle_nap, bump PC forward by 8. */ { - lw r28, r29 + PTREGS_PTR(r29, PTREGS_OFFSET_PC) moveli r27, lo16(_cpu_idle_nap) } { + lw r28, r29 auli r27, r27, ha16(_cpu_idle_nap) } { @@ -835,6 +846,18 @@ STD_ENTRY(interrupt_return) FEEDBACK_REENTER(interrupt_return) /* + * Use r33 to hold whether we have already loaded the callee-saves + * into ptregs. We don't want to do it twice in this loop, since + * then we'd clobber whatever changes are made by ptrace, etc. + * Get base of stack in r32. + */ + { + GET_THREAD_INFO(r32) + movei r33, 0 + } + +.Lretry_work_pending: + /* * Disable interrupts so as to make sure we don't * miss an interrupt that sets any of the thread flags (like * need_resched or sigpending) between sampling and the iret. @@ -844,9 +867,6 @@ STD_ENTRY(interrupt_return) IRQ_DISABLE(r20, r21) TRACE_IRQS_OFF /* Note: clobbers registers r0-r29 */ - /* Get base of stack in r32; note r30/31 are used as arguments here. */ - GET_THREAD_INFO(r32) - /* Check to see if there is any work to do before returning to user. */ { @@ -862,16 +882,18 @@ STD_ENTRY(interrupt_return) /* * Make sure we have all the registers saved for signal - * handling or single-step. Call out to C code to figure out - * exactly what we need to do for each flag bit, then if - * necessary, reload the flags and recheck. + * handling, notify-resume, or single-step. Call out to C + * code to figure out exactly what we need to do for each flag bit, + * then if necessary, reload the flags and recheck. */ - push_extra_callee_saves r0 { PTREGS_PTR(r0, PTREGS_OFFSET_BASE) - jal do_work_pending + bnz r33, 1f } - bnz r0, .Lresume_userspace + push_extra_callee_saves r0 + movei r33, 1 +1: jal do_work_pending + bnz r0, .Lretry_work_pending /* * In the NMI case we @@ -924,6 +946,13 @@ STD_ENTRY(interrupt_return) bzt r30, .Lrestore_regs 3: + /* We are relying on INT_PERF_COUNT at 33, and AUX_PERF_COUNT at 48 */ + { + moveli r0, lo16(1 << (INT_PERF_COUNT - 32)) + bz r31, .Lrestore_regs + } + auli r0, r0, ha16(1 << (INT_AUX_PERF_COUNT - 32)) + mtspr SPR_INTERRUPT_MASK_RESET_K_1, r0 /* * We now commit to returning from this interrupt, since we will be @@ -1149,6 +1178,10 @@ handle_nmi: PTREGS_PTR(r0, PTREGS_OFFSET_BASE) } FEEDBACK_REENTER(handle_nmi) + { + movei r30, 1 + seq r31, r0, zero + } j interrupt_return STD_ENDPROC(handle_nmi) @@ -1176,15 +1209,20 @@ handle_syscall: add r20, r20, tp lw r21, r20 addi r21, r21, 1 - sw r20, r21 + { + sw r20, r21 + GET_THREAD_INFO(r31) + } /* Trace syscalls, if requested. */ - GET_THREAD_INFO(r31) addi r31, r31, THREAD_INFO_FLAGS_OFFSET lw r30, r31 andi r30, r30, _TIF_SYSCALL_TRACE bzt r30, .Lrestore_syscall_regs - jal do_syscall_trace + { + PTREGS_PTR(r0, PTREGS_OFFSET_BASE) + jal do_syscall_trace_enter + } FEEDBACK_REENTER(handle_syscall) /* @@ -1235,9 +1273,15 @@ handle_syscall: lw r30, r31 andi r30, r30, _TIF_SYSCALL_TRACE bzt r30, 1f - jal do_syscall_trace + { + PTREGS_PTR(r0, PTREGS_OFFSET_BASE) + jal do_syscall_trace_exit + } FEEDBACK_REENTER(handle_syscall) -1: j .Lresume_userspace /* jump into middle of interrupt_return */ +1: { + movei r30, 0 /* not an NMI */ + j .Lresume_userspace /* jump into middle of interrupt_return */ + } .Linvalid_syscall: /* Report an invalid syscall back to the user program */ @@ -1246,7 +1290,10 @@ handle_syscall: movei r28, -ENOSYS } sw r29, r28 - j .Lresume_userspace /* jump into middle of interrupt_return */ + { + movei r30, 0 /* not an NMI */ + j .Lresume_userspace /* jump into middle of interrupt_return */ + } STD_ENDPROC(handle_syscall) /* Return the address for oprofile to suppress in backtraces. */ @@ -1262,9 +1309,27 @@ STD_ENTRY(ret_from_fork) jal sim_notify_fork jal schedule_tail FEEDBACK_REENTER(ret_from_fork) - j .Lresume_userspace /* jump into middle of interrupt_return */ + { + movei r30, 0 /* not an NMI */ + j .Lresume_userspace /* jump into middle of interrupt_return */ + } STD_ENDPROC(ret_from_fork) +STD_ENTRY(ret_from_kernel_thread) + jal sim_notify_fork + jal schedule_tail + FEEDBACK_REENTER(ret_from_fork) + { + move r0, r31 + jalr r30 + } + FEEDBACK_REENTER(ret_from_kernel_thread) + { + movei r30, 0 /* not an NMI */ + j .Lresume_userspace /* jump into middle of interrupt_return */ + } + STD_ENDPROC(ret_from_kernel_thread) + /* * Code for ill interrupt. */ @@ -1349,7 +1414,10 @@ handle_ill: 3: /* set PC and continue */ lw r26, r24 - sw r28, r26 + { + sw r28, r26 + GET_THREAD_INFO(r0) + } /* * Clear TIF_SINGLESTEP to prevent recursion if we execute an ill. @@ -1357,7 +1425,6 @@ handle_ill: * need to clear it here and can't really impose on all other arches. * So what's another write between friends? */ - GET_THREAD_INFO(r0) addi r1, r0, THREAD_INFO_FLAGS_OFFSET { @@ -1371,12 +1438,14 @@ handle_ill: { lw r0, r0 /* indirect thru thread_info to get task_info*/ addi r1, sp, C_ABI_SAVE_AREA_SIZE /* put ptregs pointer into r1 */ - move r2, zero /* load error code into r2 */ } jal send_sigtrap /* issue a SIGTRAP */ FEEDBACK_REENTER(handle_ill) - j .Lresume_userspace /* jump into middle of interrupt_return */ + { + movei r30, 0 /* not an NMI */ + j .Lresume_userspace /* jump into middle of interrupt_return */ + } .Ldispatch_normal_ill: { @@ -1406,15 +1475,6 @@ STD_ENTRY_LOCAL(bad_intr) panic "Unhandled interrupt %#x: PC %#lx" STD_ENDPROC(bad_intr) -/* Put address of pt_regs in reg and jump. */ -#define PTREGS_SYSCALL(x, reg) \ - STD_ENTRY(_##x); \ - { \ - PTREGS_PTR(reg, PTREGS_OFFSET_BASE); \ - j x \ - }; \ - STD_ENDPROC(_##x) - /* * Special-case sigreturn to not write r0 to the stack on return. * This is technically more efficient, but it also avoids difficulties @@ -1430,12 +1490,9 @@ STD_ENTRY_LOCAL(bad_intr) }; \ STD_ENDPROC(_##x) -PTREGS_SYSCALL(sys_execve, r3) -PTREGS_SYSCALL(sys_sigaltstack, r2) PTREGS_SYSCALL_SIGRETURN(sys_rt_sigreturn, r0) -PTREGS_SYSCALL(sys_cmpxchg_badaddr, r1) -/* Save additional callee-saves to pt_regs, put address in r4 and jump. */ +/* Save additional callee-saves to pt_regs and jump to standard function. */ STD_ENTRY(_sys_clone) push_extra_callee_saves r4 j sys_clone @@ -1478,12 +1535,10 @@ STD_ENTRY(_sys_clone) __HEAD .align 64 /* Align much later jump on the start of a cache line. */ -#if !ATOMIC_LOCKS_FOUND_VIA_TABLE() nop #if PAGE_SIZE >= 0x10000 nop #endif -#endif ENTRY(sys_cmpxchg) /* @@ -1517,45 +1572,6 @@ ENTRY(sys_cmpxchg) # error Code here assumes PAGE_OFFSET can be loaded with just hi16() #endif -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - { - /* Check for unaligned input. */ - bnz sp, .Lcmpxchg_badaddr - mm r25, r0, zero, 3, PAGE_SHIFT-1 - } - { - crc32_32 r25, zero, r25 - moveli r21, lo16(atomic_lock_ptr) - } - { - auli r21, r21, ha16(atomic_lock_ptr) - auli r23, zero, hi16(PAGE_OFFSET) /* hugepage-aligned */ - } - { - shri r20, r25, 32 - ATOMIC_HASH_L1_SHIFT - slt_u r23, r0, r23 - lw r26, r0 /* see comment in the "#else" for the "lw r26". */ - } - { - s2a r21, r20, r21 - bbns r23, .Lcmpxchg_badaddr - } - { - lw r21, r21 - seqi r23, TREG_SYSCALL_NR_NAME, __NR_FAST_cmpxchg64 - andi r25, r25, ATOMIC_HASH_L2_SIZE - 1 - } - { - /* Branch away at this point if we're doing a 64-bit cmpxchg. */ - bbs r23, .Lcmpxchg64 - andi r23, r0, 7 /* Precompute alignment for cmpxchg64. */ - } - { - s2a ATOMIC_LOCK_REG_NAME, r25, r21 - j .Lcmpxchg32_tns /* see comment in the #else for the jump. */ - } - -#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ { /* Check for unaligned input. */ bnz sp, .Lcmpxchg_badaddr @@ -1569,7 +1585,7 @@ ENTRY(sys_cmpxchg) * Because of C pointer arithmetic, we want to compute this: * * ((char*)atomic_locks + - * (((r0 >> 3) & (1 << (ATOMIC_HASH_SIZE - 1))) << 2)) + * (((r0 >> 3) & ((1 << ATOMIC_HASH_SHIFT) - 1)) << 2)) * * Instead of two shifts we just ">> 1", and use 'mm' * to ignore the low and high bits we don't want. @@ -1580,12 +1596,9 @@ ENTRY(sys_cmpxchg) /* * Ensure that the TLB is loaded before we take out the lock. - * On tilepro, this will start fetching the value all the way - * into our L1 as well (and if it gets modified before we - * grab the lock, it will be invalidated from our cache - * before we reload it). On tile64, we'll start fetching it - * into our L1 if we're the home, and if we're not, we'll - * still at least start fetching it into the home's L2. + * This will start fetching the value all the way into our L1 + * as well (and if it gets modified before we grab the lock, + * it will be invalidated from our cache before we reload it). */ lw r26, r0 } @@ -1628,8 +1641,6 @@ ENTRY(sys_cmpxchg) j .Lcmpxchg32_tns } -#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ - /* Symbol for do_page_fault_ics() to use to compare against the PC. */ .global __sys_cmpxchg_grab_lock __sys_cmpxchg_grab_lock: @@ -1767,9 +1778,6 @@ __sys_cmpxchg_grab_lock: .align 64 .Lcmpxchg64: { -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - s2a ATOMIC_LOCK_REG_NAME, r25, r21 -#endif bzt r23, .Lcmpxchg64_tns } j .Lcmpxchg_badaddr @@ -1835,11 +1843,12 @@ int_unalign: push_extra_callee_saves r0 j do_trap -/* Include .intrpt1 array of interrupt vectors */ - .section ".intrpt1", "ax" +/* Include .intrpt array of interrupt vectors */ + .section ".intrpt", "ax" -#define op_handle_perf_interrupt bad_intr -#define op_handle_aux_perf_interrupt bad_intr +#ifndef CONFIG_USE_PMC +#define handle_perf_interrupt bad_intr +#endif #ifndef CONFIG_HARDWALL #define do_hardwall_trap bad_intr @@ -1880,7 +1889,7 @@ int_unalign: int_hand INT_IDN_AVAIL, IDN_AVAIL, bad_intr int_hand INT_UDN_AVAIL, UDN_AVAIL, bad_intr int_hand INT_PERF_COUNT, PERF_COUNT, \ - op_handle_perf_interrupt, handle_nmi + handle_perf_interrupt, handle_nmi int_hand INT_INTCTRL_3, INTCTRL_3, bad_intr #if CONFIG_KERNEL_PL == 2 dc_dispatch INT_INTCTRL_2, INTCTRL_2 @@ -1904,10 +1913,8 @@ int_unalign: do_page_fault int_hand INT_SN_CPL, SN_CPL, bad_intr int_hand INT_DOUBLE_FAULT, DOUBLE_FAULT, do_trap -#if CHIP_HAS_AUX_PERF_COUNTERS() int_hand INT_AUX_PERF_COUNT, AUX_PERF_COUNT, \ - op_handle_aux_perf_interrupt, handle_nmi -#endif + handle_perf_interrupt, handle_nmi /* Synthetic interrupt delivered only by the simulator */ int_hand INT_BREAKPOINT, BREAKPOINT, do_breakpoint diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S index 79c93e10ba2..5b67efcecab 100644 --- a/arch/tile/kernel/intvec_64.S +++ b/arch/tile/kernel/intvec_64.S @@ -17,24 +17,33 @@ #include <linux/linkage.h> #include <linux/errno.h> #include <linux/unistd.h> +#include <linux/init.h> #include <asm/ptrace.h> #include <asm/thread_info.h> #include <asm/irqflags.h> #include <asm/asm-offsets.h> #include <asm/types.h> +#include <asm/traps.h> +#include <asm/signal.h> #include <hv/hypervisor.h> #include <arch/abi.h> #include <arch/interrupts.h> #include <arch/spr_def.h> -#ifdef CONFIG_PREEMPT -# error "No support for kernel preemption currently" -#endif - #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg) #define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR) +#if CONFIG_KERNEL_PL == 1 || CONFIG_KERNEL_PL == 2 +/* + * Set "result" non-zero if ex1 holds the PL of the kernel + * (with or without ICS being set). Note this works only + * because we never find the PL at level 3. + */ +# define IS_KERNEL_EX1(result, ex1) andi result, ex1, CONFIG_KERNEL_PL +#else +# error Recode IS_KERNEL_EX1 for CONFIG_KERNEL_PL +#endif .macro push_reg reg, ptr=sp, delta=-8 { @@ -97,6 +106,185 @@ } .endm + /* + * Unalign data exception fast handling: In order to handle + * unaligned data access, a fast JIT version is generated and stored + * in a specific area in user space. We first need to do a quick poke + * to see if the JIT is available. We use certain bits in the fault + * PC (3 to 9 is used for 16KB page size) as index to address the JIT + * code area. The first 64bit word is the fault PC, and the 2nd one is + * the fault bundle itself. If these 2 words both match, then we + * directly "iret" to JIT code. If not, a slow path is invoked to + * generate new JIT code. Note: the current JIT code WILL be + * overwritten if it existed. So, ideally we can handle 128 unalign + * fixups via JIT. For lookup efficiency and to effectively support + * tight loops with multiple unaligned reference, a simple + * direct-mapped cache is used. + * + * SPR_EX_CONTEXT_K_0 is modified to return to JIT code. + * SPR_EX_CONTEXT_K_1 has ICS set. + * SPR_EX_CONTEXT_0_0 is setup to user program's next PC. + * SPR_EX_CONTEXT_0_1 = 0. + */ + .macro int_hand_unalign_fast vecnum, vecname + .org (\vecnum << 8) +intvec_\vecname: + /* Put r3 in SPR_SYSTEM_SAVE_K_1. */ + mtspr SPR_SYSTEM_SAVE_K_1, r3 + + mfspr r3, SPR_EX_CONTEXT_K_1 + /* + * Examine if exception comes from user without ICS set. + * If not, just go directly to the slow path. + */ + bnez r3, hand_unalign_slow_nonuser + + mfspr r3, SPR_SYSTEM_SAVE_K_0 + + /* Get &thread_info->unalign_jit_tmp[0] in r3. */ + bfexts r3, r3, 0, CPU_SHIFT-1 + mm r3, zero, LOG2_THREAD_SIZE, 63 + addli r3, r3, THREAD_INFO_UNALIGN_JIT_TMP_OFFSET + + /* + * Save r0, r1, r2 into thread_info array r3 points to + * from low to high memory in order. + */ + st_add r3, r0, 8 + st_add r3, r1, 8 + { + st_add r3, r2, 8 + andi r2, sp, 7 + } + + /* Save stored r3 value so we can revert it on a page fault. */ + mfspr r1, SPR_SYSTEM_SAVE_K_1 + st r3, r1 + + { + /* Generate a SIGBUS if sp is not 8-byte aligned. */ + bnez r2, hand_unalign_slow_badsp + } + + /* + * Get the thread_info in r0; load r1 with pc. Set the low bit of sp + * as an indicator to the page fault code in case we fault. + */ + { + ori sp, sp, 1 + mfspr r1, SPR_EX_CONTEXT_K_0 + } + + /* Add the jit_info offset in thread_info; extract r1 [3:9] into r2. */ + { + addli r0, r3, THREAD_INFO_UNALIGN_JIT_BASE_OFFSET - \ + (THREAD_INFO_UNALIGN_JIT_TMP_OFFSET + (3 * 8)) + bfextu r2, r1, 3, (2 + PAGE_SHIFT - UNALIGN_JIT_SHIFT) + } + + /* Load the jit_info; multiply r2 by 128. */ + { + ld r0, r0 + shli r2, r2, UNALIGN_JIT_SHIFT + } + + /* + * If r0 is NULL, the JIT page is not mapped, so go to slow path; + * add offset r2 to r0 at the same time. + */ + { + beqz r0, hand_unalign_slow + add r2, r0, r2 + } + + /* + * We are loading from userspace (both the JIT info PC and + * instruction word, and the instruction word we executed) + * and since either could fault while holding the interrupt + * critical section, we must tag this region and check it in + * do_page_fault() to handle it properly. + */ +ENTRY(__start_unalign_asm_code) + + /* Load first word of JIT in r0 and increment r2 by 8. */ + ld_add r0, r2, 8 + + /* + * Compare the PC with the 1st word in JIT; load the fault bundle + * into r1. + */ + { + cmpeq r0, r0, r1 + ld r1, r1 + } + + /* Go to slow path if PC doesn't match. */ + beqz r0, hand_unalign_slow + + /* + * Load the 2nd word of JIT, which is supposed to be the fault + * bundle for a cache hit. Increment r2; after this bundle r2 will + * point to the potential start of the JIT code we want to run. + */ + ld_add r0, r2, 8 + + /* No further accesses to userspace are done after this point. */ +ENTRY(__end_unalign_asm_code) + + /* Compare the real bundle with what is saved in the JIT area. */ + { + cmpeq r0, r1, r0 + mtspr SPR_EX_CONTEXT_0_1, zero + } + + /* Go to slow path if the fault bundle does not match. */ + beqz r0, hand_unalign_slow + + /* + * A cache hit is found. + * r2 points to start of JIT code (3rd word). + * r0 is the fault pc. + * r1 is the fault bundle. + * Reset the low bit of sp. + */ + { + mfspr r0, SPR_EX_CONTEXT_K_0 + andi sp, sp, ~1 + } + + /* Write r2 into EX_CONTEXT_K_0 and increment PC. */ + { + mtspr SPR_EX_CONTEXT_K_0, r2 + addi r0, r0, 8 + } + + /* + * Set ICS on kernel EX_CONTEXT_K_1 in order to "iret" to + * user with ICS set. This way, if the JIT fixup causes another + * unalign exception (which shouldn't be possible) the user + * process will be terminated with SIGBUS. Also, our fixup will + * run without interleaving with external interrupts. + * Each fixup is at most 14 bundles, so it won't hold ICS for long. + */ + { + movei r1, PL_ICS_EX1(USER_PL, 1) + mtspr SPR_EX_CONTEXT_0_0, r0 + } + + { + mtspr SPR_EX_CONTEXT_K_1, r1 + addi r3, r3, -(3 * 8) + } + + /* Restore r0..r3. */ + ld_add r0, r3, 8 + ld_add r1, r3, 8 + ld_add r2, r3, 8 + ld r3, r3 + + iret + ENDPROC(intvec_\vecname) + .endm #ifdef __COLLECT_LINKER_FEEDBACK__ .pushsection .text.intvec_feedback,"ax" @@ -117,15 +305,21 @@ intvec_feedback: * The "processing" argument specifies the code for processing * the interrupt. Defaults to "handle_interrupt". */ - .macro int_hand vecnum, vecname, c_routine, processing=handle_interrupt - .org (\vecnum << 8) + .macro __int_hand vecnum, vecname, c_routine,processing=handle_interrupt intvec_\vecname: /* Temporarily save a register so we have somewhere to work. */ mtspr SPR_SYSTEM_SAVE_K_1, r0 mfspr r0, SPR_EX_CONTEXT_K_1 - andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */ + /* + * The unalign data fastpath code sets the low bit in sp to + * force us to reset it here on fault. + */ + { + blbs sp, 2f + IS_KERNEL_EX1(r0, r0) + } .ifc \vecnum, INT_DOUBLE_FAULT /* @@ -175,15 +369,15 @@ intvec_\vecname: } .endif - +2: /* - * SYSTEM_SAVE_K_0 holds the cpu number in the low bits, and - * the current stack top in the higher bits. So we recover - * our stack top by just masking off the low bits, then + * SYSTEM_SAVE_K_0 holds the cpu number in the high bits, and + * the current stack top in the lower bits. So we recover + * our starting stack value by sign-extending the low bits, then * point sp at the top aligned address on the actual stack page. */ mfspr r0, SPR_SYSTEM_SAVE_K_0 - mm r0, zero, LOG2_THREAD_SIZE, 63 + bfexts r0, r0, 0, CPU_SHIFT-1 0: /* @@ -205,6 +399,9 @@ intvec_\vecname: * cache line 1: r6...r13 * cache line 0: 2 x frame, r0..r5 */ +#if STACK_TOP_DELTA != 64 +#error STACK_TOP_DELTA must be 64 for assumptions here and in task_pt_regs() +#endif andi r0, r0, -64 /* @@ -219,7 +416,9 @@ intvec_\vecname: * This routine saves just the first four registers, plus the * stack context so we can do proper backtracing right away, * and defers to handle_interrupt to save the rest. - * The backtracer needs pc, ex1, lr, sp, r52, and faultnum. + * The backtracer needs pc, ex1, lr, sp, r52, and faultnum, + * and needs sp set to its final location at the bottom of + * the stack frame. */ addli r0, r0, PTREGS_OFFSET_LR - (PTREGS_SIZE + KSTK_PTREGS_GAP) wh64 r0 /* cache line 7 */ @@ -302,7 +501,7 @@ intvec_\vecname: mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */ .else .ifc \vecnum, INT_ILL_TRANS - mfspr r2, ILL_TRANS_REASON + mfspr r2, ILL_VA_PC .else .ifc \vecnum, INT_DOUBLE_FAULT mfspr r2, SPR_SYSTEM_SAVE_K_2 /* double fault info from HV */ @@ -310,14 +509,12 @@ intvec_\vecname: .ifc \c_routine, do_trap mfspr r2, GPV_REASON .else - .ifc \c_routine, op_handle_perf_interrupt + .ifc \c_routine, handle_perf_interrupt mfspr r2, PERF_COUNT_STS -#if CHIP_HAS_AUX_PERF_COUNTERS() .else - .ifc \c_routine, op_handle_aux_perf_interrupt + .ifc \c_routine, handle_perf_interrupt mfspr r2, AUX_PERF_COUNT_STS .endif -#endif .endif .endif .endif @@ -336,7 +533,7 @@ intvec_\vecname: #ifdef __COLLECT_LINKER_FEEDBACK__ .pushsection .text.intvec_feedback,"ax" .org (\vecnum << 5) - FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8) + FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8) jrp lr .popsection #endif @@ -449,31 +646,15 @@ intvec_\vecname: push_reg r5, r52 st r52, r4 - /* Load tp with our per-cpu offset. */ -#ifdef CONFIG_SMP - { - mfspr r20, SPR_SYSTEM_SAVE_K_0 - moveli r21, hw2_last(__per_cpu_offset) - } - { - shl16insli r21, r21, hw1(__per_cpu_offset) - bfextu r20, r20, 0, LOG2_THREAD_SIZE-1 - } - shl16insli r21, r21, hw0(__per_cpu_offset) - shl3add r20, r20, r21 - ld tp, r20 -#else - move tp, zero -#endif - /* * If we will be returning to the kernel, we will need to * reset the interrupt masks to the state they had before. - * Set DISABLE_IRQ in flags iff we came from PL1 with irqs disabled. + * Set DISABLE_IRQ in flags iff we came from kernel pl with + * irqs disabled. */ mfspr r32, SPR_EX_CONTEXT_K_1 { - andi r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */ + IS_KERNEL_EX1(r22, r22) PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS) } beqzt r32, 1f /* zero if from user space */ @@ -488,6 +669,44 @@ intvec_\vecname: .endif st r21, r32 + /* + * we've captured enough state to the stack (including in + * particular our EX_CONTEXT state) that we can now release + * the interrupt critical section and replace it with our + * standard "interrupts disabled" mask value. This allows + * synchronous interrupts (and profile interrupts) to punch + * through from this point onwards. + * + * It's important that no code before this point touch memory + * other than our own stack (to keep the invariant that this + * is all that gets touched under ICS), and that no code after + * this point reference any interrupt-specific SPR, in particular + * the EX_CONTEXT_K_ values. + */ + .ifc \function,handle_nmi + IRQ_DISABLE_ALL(r20) + .else + IRQ_DISABLE(r20, r21) + .endif + mtspr INTERRUPT_CRITICAL_SECTION, zero + + /* Load tp with our per-cpu offset. */ +#ifdef CONFIG_SMP + { + mfspr r20, SPR_SYSTEM_SAVE_K_0 + moveli r21, hw2_last(__per_cpu_offset) + } + { + shl16insli r21, r21, hw1(__per_cpu_offset) + bfextu r20, r20, CPU_SHIFT, 63 + } + shl16insli r21, r21, hw0(__per_cpu_offset) + shl3add r20, r20, r21 + ld tp, r20 +#else + move tp, zero +#endif + #ifdef __COLLECT_LINKER_FEEDBACK__ /* * Notify the feedback routines that we were in the @@ -512,21 +731,6 @@ intvec_\vecname: #endif /* - * we've captured enough state to the stack (including in - * particular our EX_CONTEXT state) that we can now release - * the interrupt critical section and replace it with our - * standard "interrupts disabled" mask value. This allows - * synchronous interrupts (and profile interrupts) to punch - * through from this point onwards. - */ - .ifc \function,handle_nmi - IRQ_DISABLE_ALL(r20) - .else - IRQ_DISABLE(r20, r21) - .endif - mtspr INTERRUPT_CRITICAL_SECTION, zero - - /* * Prepare the first 256 stack bytes to be rapidly accessible * without having to fetch the background data. */ @@ -576,7 +780,7 @@ intvec_\vecname: .macro dc_dispatch vecnum, vecname .org (\vecnum << 8) intvec_\vecname: - j hv_downcall_dispatch + j _hv_downcall_dispatch ENDPROC(intvec_\vecname) .endm @@ -605,6 +809,10 @@ handle_interrupt: * This routine takes a boolean in r30 indicating if this is an NMI. * If so, we also expect a boolean in r31 indicating whether to * re-enable the oprofile interrupts. + * + * Note that .Lresume_userspace is jumped to directly in several + * places, and we need to make sure r30 is set correctly in those + * callers as well. */ STD_ENTRY(interrupt_return) /* If we're resuming to kernel space, don't check thread flags. */ @@ -613,14 +821,39 @@ STD_ENTRY(interrupt_return) PTREGS_PTR(r29, PTREGS_OFFSET_EX1) } ld r29, r29 - andi r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */ + IS_KERNEL_EX1(r29, r29) { beqzt r29, .Lresume_userspace - PTREGS_PTR(r29, PTREGS_OFFSET_PC) + move r29, sp } +#ifdef CONFIG_PREEMPT + /* Returning to kernel space. Check if we need preemption. */ + EXTRACT_THREAD_INFO(r29) + addli r28, r29, THREAD_INFO_FLAGS_OFFSET + { + ld r28, r28 + addli r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET + } + { + andi r28, r28, _TIF_NEED_RESCHED + ld4s r29, r29 + } + beqzt r28, 1f + bnez r29, 1f + /* Disable interrupts explicitly for preemption. */ + IRQ_DISABLE(r20,r21) + TRACE_IRQS_OFF + jal preempt_schedule_irq + FEEDBACK_REENTER(interrupt_return) +1: +#endif + /* If we're resuming to _cpu_idle_nap, bump PC forward by 8. */ - moveli r27, hw2_last(_cpu_idle_nap) + { + moveli r27, hw2_last(_cpu_idle_nap) + PTREGS_PTR(r29, PTREGS_OFFSET_PC) + } { ld r28, r29 shl16insli r27, r27, hw1(_cpu_idle_nap) @@ -642,6 +875,20 @@ STD_ENTRY(interrupt_return) FEEDBACK_REENTER(interrupt_return) /* + * Use r33 to hold whether we have already loaded the callee-saves + * into ptregs. We don't want to do it twice in this loop, since + * then we'd clobber whatever changes are made by ptrace, etc. + */ + { + movei r33, 0 + move r32, sp + } + + /* Get base of stack in r32. */ + EXTRACT_THREAD_INFO(r32) + +.Lretry_work_pending: + /* * Disable interrupts so as to make sure we don't * miss an interrupt that sets any of the thread flags (like * need_resched or sigpending) between sampling and the iret. @@ -651,9 +898,6 @@ STD_ENTRY(interrupt_return) IRQ_DISABLE(r20, r21) TRACE_IRQS_OFF /* Note: clobbers registers r0-r29 */ - /* Get base of stack in r32; note r30/31 are used as arguments here. */ - GET_THREAD_INFO(r32) - /* Check to see if there is any work to do before returning to user. */ { @@ -669,16 +913,18 @@ STD_ENTRY(interrupt_return) /* * Make sure we have all the registers saved for signal - * handling or single-step. Call out to C code to figure out + * handling or notify-resume. Call out to C code to figure out * exactly what we need to do for each flag bit, then if * necessary, reload the flags and recheck. */ - push_extra_callee_saves r0 { PTREGS_PTR(r0, PTREGS_OFFSET_BASE) - jal do_work_pending + bnez r33, 1f } - bnez r0, .Lresume_userspace + push_extra_callee_saves r0 + movei r33, 1 +1: jal do_work_pending + bnez r0, .Lretry_work_pending /* * In the NMI case we @@ -702,7 +948,7 @@ STD_ENTRY(interrupt_return) PTREGS_PTR(r32, PTREGS_OFFSET_FLAGS) } { - andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK + IS_KERNEL_EX1(r0, r0) ld r32, r32 } bnez r0, 1f @@ -718,12 +964,22 @@ STD_ENTRY(interrupt_return) beqzt r30, .Lrestore_regs j 3f 2: TRACE_IRQS_ON + IRQ_ENABLE_LOAD(r20, r21) movei r0, 1 mtspr INTERRUPT_CRITICAL_SECTION, r0 - IRQ_ENABLE(r20, r21) + IRQ_ENABLE_APPLY(r20, r21) beqzt r30, .Lrestore_regs 3: +#if INT_PERF_COUNT + 1 != INT_AUX_PERF_COUNT +# error Bad interrupt assumption +#endif + { + movei r0, 3 /* two adjacent bits for the PERF_COUNT mask */ + beqz r31, .Lrestore_regs + } + shli r0, r0, INT_PERF_COUNT + mtspr SPR_INTERRUPT_MASK_RESET_K, r0 /* * We now commit to returning from this interrupt, since we will be @@ -737,7 +993,6 @@ STD_ENTRY(interrupt_return) * that will save some cycles if this turns out to be a syscall. */ .Lrestore_regs: - FEEDBACK_REENTER(interrupt_return) /* called from elsewhere */ /* * Rotate so we have one high bit and one low bit to test. @@ -773,7 +1028,7 @@ STD_ENTRY(interrupt_return) pop_reg r21, sp, PTREGS_OFFSET_REG(31) - PTREGS_OFFSET_PC { mtspr SPR_EX_CONTEXT_K_1, lr - andi lr, lr, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */ + IS_KERNEL_EX1(lr, lr) } { mtspr SPR_EX_CONTEXT_K_0, r21 @@ -941,7 +1196,7 @@ handle_nmi: FEEDBACK_REENTER(handle_nmi) { movei r30, 1 - move r31, r0 + cmpeq r31, r0, zero } j interrupt_return STD_ENDPROC(handle_nmi) @@ -963,19 +1218,30 @@ handle_syscall: shl16insli r20, r20, hw0(irq_stat + IRQ_CPUSTAT_SYSCALL_COUNT_OFFSET) add r20, r20, tp ld4s r21, r20 - addi r21, r21, 1 - st4 r20, r21 + { + addi r21, r21, 1 + move r31, sp + } + { + st4 r20, r21 + EXTRACT_THREAD_INFO(r31) + } /* Trace syscalls, if requested. */ - GET_THREAD_INFO(r31) addi r31, r31, THREAD_INFO_FLAGS_OFFSET - ld r30, r31 - andi r30, r30, _TIF_SYSCALL_TRACE + { + ld r30, r31 + moveli r32, _TIF_SYSCALL_ENTRY_WORK + } + and r30, r30, r32 { addi r30, r31, THREAD_INFO_STATUS_OFFSET - THREAD_INFO_FLAGS_OFFSET beqzt r30, .Lrestore_syscall_regs } - jal do_syscall_trace + { + PTREGS_PTR(r0, PTREGS_OFFSET_BASE) + jal do_syscall_trace_enter + } FEEDBACK_REENTER(handle_syscall) /* @@ -1004,7 +1270,9 @@ handle_syscall: /* Ensure that the syscall number is within the legal range. */ { moveli r20, hw2(sys_call_table) +#ifdef CONFIG_COMPAT blbs r30, .Lcompat_syscall +#endif } { cmpltu r21, TREG_SYSCALL_NR_NAME, r21 @@ -1038,13 +1306,37 @@ handle_syscall: FEEDBACK_REENTER(handle_syscall) /* Do syscall trace again, if requested. */ - ld r30, r31 - andi r30, r30, _TIF_SYSCALL_TRACE - beqzt r30, 1f - jal do_syscall_trace + { + ld r30, r31 + moveli r32, _TIF_SYSCALL_EXIT_WORK + } + and r0, r30, r32 + { + andi r0, r30, _TIF_SINGLESTEP + beqzt r0, 1f + } + { + PTREGS_PTR(r0, PTREGS_OFFSET_BASE) + jal do_syscall_trace_exit + } FEEDBACK_REENTER(handle_syscall) -1: j .Lresume_userspace /* jump into middle of interrupt_return */ + andi r0, r30, _TIF_SINGLESTEP + +1: beqzt r0, 2f + + /* Single stepping -- notify ptrace. */ + { + movei r0, SIGTRAP + jal ptrace_notify + } + FEEDBACK_REENTER(handle_syscall) + +2: { + movei r30, 0 /* not an NMI */ + j .Lresume_userspace /* jump into middle of interrupt_return */ + } +#ifdef CONFIG_COMPAT .Lcompat_syscall: /* * Load the base of the compat syscall table in r20, and @@ -1069,6 +1361,7 @@ handle_syscall: { move r15, r4; addxi r4, r4, 0 } { move r16, r5; addxi r5, r5, 0 } j .Lload_syscall_pointer +#endif .Linvalid_syscall: /* Report an invalid syscall back to the user program */ @@ -1077,7 +1370,10 @@ handle_syscall: movei r28, -ENOSYS } st r29, r28 - j .Lresume_userspace /* jump into middle of interrupt_return */ + { + movei r30, 0 /* not an NMI */ + j .Lresume_userspace /* jump into middle of interrupt_return */ + } STD_ENDPROC(handle_syscall) /* Return the address for oprofile to suppress in backtraces. */ @@ -1093,9 +1389,27 @@ STD_ENTRY(ret_from_fork) jal sim_notify_fork jal schedule_tail FEEDBACK_REENTER(ret_from_fork) - j .Lresume_userspace + { + movei r30, 0 /* not an NMI */ + j .Lresume_userspace /* jump into middle of interrupt_return */ + } STD_ENDPROC(ret_from_fork) +STD_ENTRY(ret_from_kernel_thread) + jal sim_notify_fork + jal schedule_tail + FEEDBACK_REENTER(ret_from_fork) + { + move r0, r31 + jalr r30 + } + FEEDBACK_REENTER(ret_from_kernel_thread) + { + movei r30, 0 /* not an NMI */ + j .Lresume_userspace /* jump into middle of interrupt_return */ + } + STD_ENDPROC(ret_from_kernel_thread) + /* Various stub interrupt handlers and syscall handlers */ STD_ENTRY_LOCAL(_kernel_double_fault) @@ -1112,15 +1426,6 @@ STD_ENTRY_LOCAL(bad_intr) panic "Unhandled interrupt %#x: PC %#lx" STD_ENDPROC(bad_intr) -/* Put address of pt_regs in reg and jump. */ -#define PTREGS_SYSCALL(x, reg) \ - STD_ENTRY(_##x); \ - { \ - PTREGS_PTR(reg, PTREGS_OFFSET_BASE); \ - j x \ - }; \ - STD_ENDPROC(_##x) - /* * Special-case sigreturn to not write r0 to the stack on return. * This is technically more efficient, but it also avoids difficulties @@ -1136,37 +1441,74 @@ STD_ENTRY_LOCAL(bad_intr) }; \ STD_ENDPROC(_##x) -PTREGS_SYSCALL(sys_execve, r3) -PTREGS_SYSCALL(sys_sigaltstack, r2) PTREGS_SYSCALL_SIGRETURN(sys_rt_sigreturn, r0) #ifdef CONFIG_COMPAT -PTREGS_SYSCALL(compat_sys_execve, r3) -PTREGS_SYSCALL(compat_sys_sigaltstack, r2) PTREGS_SYSCALL_SIGRETURN(compat_sys_rt_sigreturn, r0) #endif -/* Save additional callee-saves to pt_regs, put address in r4 and jump. */ +/* Save additional callee-saves to pt_regs and jump to standard function. */ STD_ENTRY(_sys_clone) push_extra_callee_saves r4 j sys_clone STD_ENDPROC(_sys_clone) -/* The single-step support may need to read all the registers. */ + /* + * Recover r3, r2, r1 and r0 here saved by unalign fast vector. + * The vector area limit is 32 bundles, so we handle the reload here. + * r0, r1, r2 are in thread_info from low to high memory in order. + * r3 points to location the original r3 was saved. + * We put this code in the __HEAD section so it can be reached + * via a conditional branch from the fast path. + */ + __HEAD +hand_unalign_slow: + andi sp, sp, ~1 +hand_unalign_slow_badsp: + addi r3, r3, -(3 * 8) + ld_add r0, r3, 8 + ld_add r1, r3, 8 + ld r2, r3 +hand_unalign_slow_nonuser: + mfspr r3, SPR_SYSTEM_SAVE_K_1 + __int_hand INT_UNALIGN_DATA, UNALIGN_DATA_SLOW, int_unalign + +/* The unaligned data support needs to read all the registers. */ int_unalign: push_extra_callee_saves r0 - j do_trap + j do_unaligned +ENDPROC(hand_unalign_slow) -/* Include .intrpt1 array of interrupt vectors */ - .section ".intrpt1", "ax" +/* Fill the return address stack with nonzero entries. */ +STD_ENTRY(fill_ra_stack) + { + move r0, lr + jal 1f + } +1: jal 2f +2: jal 3f +3: jal 4f +4: jrp r0 + STD_ENDPROC(fill_ra_stack) -#define op_handle_perf_interrupt bad_intr -#define op_handle_aux_perf_interrupt bad_intr + .macro int_hand vecnum, vecname, c_routine, processing=handle_interrupt + .org (\vecnum << 8) + __int_hand \vecnum, \vecname, \c_routine, \processing + .endm + +/* Include .intrpt array of interrupt vectors */ + .section ".intrpt", "ax" + .global intrpt_start +intrpt_start: + +#ifndef CONFIG_USE_PMC +#define handle_perf_interrupt bad_intr +#endif #ifndef CONFIG_HARDWALL #define do_hardwall_trap bad_intr #endif - int_hand INT_MEM_ERROR, MEM_ERROR, bad_intr + int_hand INT_MEM_ERROR, MEM_ERROR, do_trap int_hand INT_SINGLE_STEP_3, SINGLE_STEP_3, bad_intr #if CONFIG_KERNEL_PL == 2 int_hand INT_SINGLE_STEP_2, SINGLE_STEP_2, gx_singlestep_handle @@ -1188,10 +1530,10 @@ int_unalign: int_hand INT_SWINT_1, SWINT_1, SYSCALL, handle_syscall int_hand INT_SWINT_0, SWINT_0, do_trap int_hand INT_ILL_TRANS, ILL_TRANS, do_trap - int_hand INT_UNALIGN_DATA, UNALIGN_DATA, int_unalign + int_hand_unalign_fast INT_UNALIGN_DATA, UNALIGN_DATA int_hand INT_DTLB_MISS, DTLB_MISS, do_page_fault int_hand INT_DTLB_ACCESS, DTLB_ACCESS, do_page_fault - int_hand INT_IDN_FIREWALL, IDN_FIREWALL, bad_intr + int_hand INT_IDN_FIREWALL, IDN_FIREWALL, do_hardwall_trap int_hand INT_UDN_FIREWALL, UDN_FIREWALL, do_hardwall_trap int_hand INT_TILE_TIMER, TILE_TIMER, do_timer_interrupt int_hand INT_IDN_TIMER, IDN_TIMER, bad_intr @@ -1208,9 +1550,9 @@ int_unalign: #endif int_hand INT_IPI_0, IPI_0, bad_intr int_hand INT_PERF_COUNT, PERF_COUNT, \ - op_handle_perf_interrupt, handle_nmi + handle_perf_interrupt, handle_nmi int_hand INT_AUX_PERF_COUNT, AUX_PERF_COUNT, \ - op_handle_perf_interrupt, handle_nmi + handle_perf_interrupt, handle_nmi int_hand INT_INTCTRL_3, INTCTRL_3, bad_intr #if CONFIG_KERNEL_PL == 2 dc_dispatch INT_INTCTRL_2, INTCTRL_2 diff --git a/arch/tile/kernel/irq.c b/arch/tile/kernel/irq.c index 02e62806501..637f2ffaa5f 100644 --- a/arch/tile/kernel/irq.c +++ b/arch/tile/kernel/irq.c @@ -21,6 +21,7 @@ #include <hv/drv_pcie_rc_intf.h> #include <arch/spr_def.h> #include <asm/traps.h> +#include <linux/perf_event.h> /* Bit-flag stored in irq_desc->chip_data to indicate HW-cleared irqs. */ #define IS_HW_CLEARED 1 @@ -53,12 +54,6 @@ static DEFINE_PER_CPU(unsigned long, irq_disable_mask) */ static DEFINE_PER_CPU(int, irq_depth); -/* State for allocating IRQs on Gx. */ -#if CHIP_HAS_IPI() -static unsigned long available_irqs = ~(1UL << IRQ_RESCHEDULE); -static DEFINE_SPINLOCK(available_irqs_lock); -#endif - #if CHIP_HAS_IPI() /* Use SPRs to manipulate device interrupts. */ #define mask_irqs(irq_mask) __insn_mtspr(SPR_IPI_MASK_SET_K, irq_mask) @@ -73,7 +68,8 @@ static DEFINE_SPINLOCK(available_irqs_lock); /* * The interrupt handling path, implemented in terms of HV interrupt - * emulation on TILE64 and TILEPro, and IPI hardware on TILE-Gx. + * emulation on TILEPro, and IPI hardware on TILE-Gx. + * Entered with interrupts disabled. */ void tile_dev_intr(struct pt_regs *regs, int intnum) { @@ -220,7 +216,7 @@ void __init init_IRQ(void) ipi_init(); } -void __cpuinit setup_irq_regs(void) +void setup_irq_regs(void) { /* Enable interrupt delivery. */ unmask_irqs(~0UL); @@ -233,7 +229,7 @@ void tile_irq_activate(unsigned int irq, int tile_irq_type) { /* * We use handle_level_irq() by default because the pending - * interrupt vector (whether modeled by the HV on TILE64 and + * interrupt vector (whether modeled by the HV on * TILEPro or implemented in hardware on TILE-Gx) has * level-style semantics for each bit. An interrupt fires * whenever a bit is high, not just at edges. @@ -259,37 +255,27 @@ void ack_bad_irq(unsigned int irq) } /* - * Generic, controller-independent functions: + * /proc/interrupts printing: */ - -#if CHIP_HAS_IPI() -int create_irq(void) +int arch_show_interrupts(struct seq_file *p, int prec) { - unsigned long flags; - int result; - - spin_lock_irqsave(&available_irqs_lock, flags); - if (available_irqs == 0) - result = -ENOMEM; - else { - result = __ffs(available_irqs); - available_irqs &= ~(1UL << result); - dynamic_irq_init(result); - } - spin_unlock_irqrestore(&available_irqs_lock, flags); +#ifdef CONFIG_PERF_EVENTS + int i; - return result; + seq_printf(p, "%*s: ", prec, "PMI"); + + for_each_online_cpu(i) + seq_printf(p, "%10llu ", per_cpu(perf_irqs, i)); + seq_puts(p, " perf_events\n"); +#endif + return 0; } -EXPORT_SYMBOL(create_irq); -void destroy_irq(unsigned int irq) +#if CHIP_HAS_IPI() +int arch_setup_hwirq(unsigned int irq, int node) { - unsigned long flags; - - spin_lock_irqsave(&available_irqs_lock, flags); - available_irqs |= (1UL << irq); - dynamic_irq_cleanup(irq); - spin_unlock_irqrestore(&available_irqs_lock, flags); + return irq >= NR_IRQS ? -EINVAL : 0; } -EXPORT_SYMBOL(destroy_irq); + +void arch_teardown_hwirq(unsigned int irq) { } #endif diff --git a/arch/tile/kernel/kgdb.c b/arch/tile/kernel/kgdb.c new file mode 100644 index 00000000000..4cd88381a83 --- /dev/null +++ b/arch/tile/kernel/kgdb.c @@ -0,0 +1,499 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + * + * TILE-Gx KGDB support. + */ + +#include <linux/ptrace.h> +#include <linux/kgdb.h> +#include <linux/kdebug.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <asm/cacheflush.h> + +static tile_bundle_bits singlestep_insn = TILEGX_BPT_BUNDLE | DIE_SSTEPBP; +static unsigned long stepped_addr; +static tile_bundle_bits stepped_instr; + +struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { + { "r0", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[0])}, + { "r1", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[1])}, + { "r2", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[2])}, + { "r3", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[3])}, + { "r4", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[4])}, + { "r5", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[5])}, + { "r6", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[6])}, + { "r7", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[7])}, + { "r8", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[8])}, + { "r9", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[9])}, + { "r10", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[10])}, + { "r11", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[11])}, + { "r12", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[12])}, + { "r13", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[13])}, + { "r14", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[14])}, + { "r15", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[15])}, + { "r16", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[16])}, + { "r17", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[17])}, + { "r18", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[18])}, + { "r19", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[19])}, + { "r20", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[20])}, + { "r21", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[21])}, + { "r22", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[22])}, + { "r23", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[23])}, + { "r24", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[24])}, + { "r25", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[25])}, + { "r26", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[26])}, + { "r27", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[27])}, + { "r28", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[28])}, + { "r29", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[29])}, + { "r30", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[30])}, + { "r31", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[31])}, + { "r32", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[32])}, + { "r33", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[33])}, + { "r34", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[34])}, + { "r35", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[35])}, + { "r36", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[36])}, + { "r37", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[37])}, + { "r38", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[38])}, + { "r39", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[39])}, + { "r40", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[40])}, + { "r41", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[41])}, + { "r42", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[42])}, + { "r43", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[43])}, + { "r44", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[44])}, + { "r45", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[45])}, + { "r46", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[46])}, + { "r47", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[47])}, + { "r48", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[48])}, + { "r49", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[49])}, + { "r50", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[50])}, + { "r51", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[51])}, + { "r52", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[52])}, + { "tp", GDB_SIZEOF_REG, offsetof(struct pt_regs, tp)}, + { "sp", GDB_SIZEOF_REG, offsetof(struct pt_regs, sp)}, + { "lr", GDB_SIZEOF_REG, offsetof(struct pt_regs, lr)}, + { "sn", GDB_SIZEOF_REG, -1}, + { "idn0", GDB_SIZEOF_REG, -1}, + { "idn1", GDB_SIZEOF_REG, -1}, + { "udn0", GDB_SIZEOF_REG, -1}, + { "udn1", GDB_SIZEOF_REG, -1}, + { "udn2", GDB_SIZEOF_REG, -1}, + { "udn3", GDB_SIZEOF_REG, -1}, + { "zero", GDB_SIZEOF_REG, -1}, + { "pc", GDB_SIZEOF_REG, offsetof(struct pt_regs, pc)}, + { "faultnum", GDB_SIZEOF_REG, offsetof(struct pt_regs, faultnum)}, +}; + +char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) +{ + if (regno >= DBG_MAX_REG_NUM || regno < 0) + return NULL; + + if (dbg_reg_def[regno].offset != -1) + memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, + dbg_reg_def[regno].size); + else + memset(mem, 0, dbg_reg_def[regno].size); + return dbg_reg_def[regno].name; +} + +int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) +{ + if (regno >= DBG_MAX_REG_NUM || regno < 0) + return -EINVAL; + + if (dbg_reg_def[regno].offset != -1) + memcpy((void *)regs + dbg_reg_def[regno].offset, mem, + dbg_reg_def[regno].size); + return 0; +} + +/* + * Similar to pt_regs_to_gdb_regs() except that process is sleeping and so + * we may not be able to get all the info. + */ +void +sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task) +{ + int reg; + struct pt_regs *thread_regs; + unsigned long *ptr = gdb_regs; + + if (task == NULL) + return; + + /* Initialize to zero. */ + memset(gdb_regs, 0, NUMREGBYTES); + + thread_regs = task_pt_regs(task); + for (reg = 0; reg <= TREG_LAST_GPR; reg++) + *(ptr++) = thread_regs->regs[reg]; + + gdb_regs[TILEGX_PC_REGNUM] = thread_regs->pc; + gdb_regs[TILEGX_FAULTNUM_REGNUM] = thread_regs->faultnum; +} + +void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc) +{ + regs->pc = pc; +} + +static void kgdb_call_nmi_hook(void *ignored) +{ + kgdb_nmicallback(raw_smp_processor_id(), NULL); +} + +void kgdb_roundup_cpus(unsigned long flags) +{ + local_irq_enable(); + smp_call_function(kgdb_call_nmi_hook, NULL, 0); + local_irq_disable(); +} + +/* + * Convert a kernel address to the writable kernel text mapping. + */ +static unsigned long writable_address(unsigned long addr) +{ + unsigned long ret = 0; + + if (core_kernel_text(addr)) + ret = addr - MEM_SV_START + PAGE_OFFSET; + else if (is_module_text_address(addr)) + ret = addr; + else + pr_err("Unknown virtual address 0x%lx\n", addr); + + return ret; +} + +/* + * Calculate the new address for after a step. + */ +static unsigned long get_step_address(struct pt_regs *regs) +{ + int src_reg; + int jump_off; + int br_off; + unsigned long addr; + unsigned int opcode; + tile_bundle_bits bundle; + + /* Move to the next instruction by default. */ + addr = regs->pc + TILEGX_BUNDLE_SIZE_IN_BYTES; + bundle = *(unsigned long *)instruction_pointer(regs); + + /* 0: X mode, Otherwise: Y mode. */ + if (bundle & TILEGX_BUNDLE_MODE_MASK) { + if (get_Opcode_Y1(bundle) == RRR_1_OPCODE_Y1 && + get_RRROpcodeExtension_Y1(bundle) == + UNARY_RRR_1_OPCODE_Y1) { + opcode = get_UnaryOpcodeExtension_Y1(bundle); + + switch (opcode) { + case JALR_UNARY_OPCODE_Y1: + case JALRP_UNARY_OPCODE_Y1: + case JR_UNARY_OPCODE_Y1: + case JRP_UNARY_OPCODE_Y1: + src_reg = get_SrcA_Y1(bundle); + dbg_get_reg(src_reg, &addr, regs); + break; + } + } + } else if (get_Opcode_X1(bundle) == RRR_0_OPCODE_X1) { + if (get_RRROpcodeExtension_X1(bundle) == + UNARY_RRR_0_OPCODE_X1) { + opcode = get_UnaryOpcodeExtension_X1(bundle); + + switch (opcode) { + case JALR_UNARY_OPCODE_X1: + case JALRP_UNARY_OPCODE_X1: + case JR_UNARY_OPCODE_X1: + case JRP_UNARY_OPCODE_X1: + src_reg = get_SrcA_X1(bundle); + dbg_get_reg(src_reg, &addr, regs); + break; + } + } + } else if (get_Opcode_X1(bundle) == JUMP_OPCODE_X1) { + opcode = get_JumpOpcodeExtension_X1(bundle); + + switch (opcode) { + case JAL_JUMP_OPCODE_X1: + case J_JUMP_OPCODE_X1: + jump_off = sign_extend(get_JumpOff_X1(bundle), 27); + addr = regs->pc + + (jump_off << TILEGX_LOG2_BUNDLE_SIZE_IN_BYTES); + break; + } + } else if (get_Opcode_X1(bundle) == BRANCH_OPCODE_X1) { + br_off = 0; + opcode = get_BrType_X1(bundle); + + switch (opcode) { + case BEQZT_BRANCH_OPCODE_X1: + case BEQZ_BRANCH_OPCODE_X1: + if (get_SrcA_X1(bundle) == 0) + br_off = get_BrOff_X1(bundle); + break; + case BGEZT_BRANCH_OPCODE_X1: + case BGEZ_BRANCH_OPCODE_X1: + if (get_SrcA_X1(bundle) >= 0) + br_off = get_BrOff_X1(bundle); + break; + case BGTZT_BRANCH_OPCODE_X1: + case BGTZ_BRANCH_OPCODE_X1: + if (get_SrcA_X1(bundle) > 0) + br_off = get_BrOff_X1(bundle); + break; + case BLBCT_BRANCH_OPCODE_X1: + case BLBC_BRANCH_OPCODE_X1: + if (!(get_SrcA_X1(bundle) & 1)) + br_off = get_BrOff_X1(bundle); + break; + case BLBST_BRANCH_OPCODE_X1: + case BLBS_BRANCH_OPCODE_X1: + if (get_SrcA_X1(bundle) & 1) + br_off = get_BrOff_X1(bundle); + break; + case BLEZT_BRANCH_OPCODE_X1: + case BLEZ_BRANCH_OPCODE_X1: + if (get_SrcA_X1(bundle) <= 0) + br_off = get_BrOff_X1(bundle); + break; + case BLTZT_BRANCH_OPCODE_X1: + case BLTZ_BRANCH_OPCODE_X1: + if (get_SrcA_X1(bundle) < 0) + br_off = get_BrOff_X1(bundle); + break; + case BNEZT_BRANCH_OPCODE_X1: + case BNEZ_BRANCH_OPCODE_X1: + if (get_SrcA_X1(bundle) != 0) + br_off = get_BrOff_X1(bundle); + break; + } + + if (br_off != 0) { + br_off = sign_extend(br_off, 17); + addr = regs->pc + + (br_off << TILEGX_LOG2_BUNDLE_SIZE_IN_BYTES); + } + } + + return addr; +} + +/* + * Replace the next instruction after the current instruction with a + * breakpoint instruction. + */ +static void do_single_step(struct pt_regs *regs) +{ + unsigned long addr_wr; + + /* Determine where the target instruction will send us to. */ + stepped_addr = get_step_address(regs); + probe_kernel_read((char *)&stepped_instr, (char *)stepped_addr, + BREAK_INSTR_SIZE); + + addr_wr = writable_address(stepped_addr); + probe_kernel_write((char *)addr_wr, (char *)&singlestep_insn, + BREAK_INSTR_SIZE); + smp_wmb(); + flush_icache_range(stepped_addr, stepped_addr + BREAK_INSTR_SIZE); +} + +static void undo_single_step(struct pt_regs *regs) +{ + unsigned long addr_wr; + + if (stepped_instr == 0) + return; + + addr_wr = writable_address(stepped_addr); + probe_kernel_write((char *)addr_wr, (char *)&stepped_instr, + BREAK_INSTR_SIZE); + stepped_instr = 0; + smp_wmb(); + flush_icache_range(stepped_addr, stepped_addr + BREAK_INSTR_SIZE); +} + +/* + * Calls linux_debug_hook before the kernel dies. If KGDB is enabled, + * then try to fall into the debugger. + */ +static int +kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) +{ + int ret; + unsigned long flags; + struct die_args *args = (struct die_args *)ptr; + struct pt_regs *regs = args->regs; + +#ifdef CONFIG_KPROBES + /* + * Return immediately if the kprobes fault notifier has set + * DIE_PAGE_FAULT. + */ + if (cmd == DIE_PAGE_FAULT) + return NOTIFY_DONE; +#endif /* CONFIG_KPROBES */ + + switch (cmd) { + case DIE_BREAK: + case DIE_COMPILED_BPT: + break; + case DIE_SSTEPBP: + local_irq_save(flags); + kgdb_handle_exception(0, SIGTRAP, 0, regs); + local_irq_restore(flags); + return NOTIFY_STOP; + default: + /* Userspace events, ignore. */ + if (user_mode(regs)) + return NOTIFY_DONE; + } + + local_irq_save(flags); + ret = kgdb_handle_exception(args->trapnr, args->signr, args->err, regs); + local_irq_restore(flags); + if (ret) + return NOTIFY_DONE; + + return NOTIFY_STOP; +} + +static struct notifier_block kgdb_notifier = { + .notifier_call = kgdb_notify, +}; + +/* + * kgdb_arch_handle_exception - Handle architecture specific GDB packets. + * @vector: The error vector of the exception that happened. + * @signo: The signal number of the exception that happened. + * @err_code: The error code of the exception that happened. + * @remcom_in_buffer: The buffer of the packet we have read. + * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into. + * @regs: The &struct pt_regs of the current process. + * + * This function MUST handle the 'c' and 's' command packets, + * as well packets to set / remove a hardware breakpoint, if used. + * If there are additional packets which the hardware needs to handle, + * they are handled here. The code should return -1 if it wants to + * process more packets, and a %0 or %1 if it wants to exit from the + * kgdb callback. + */ +int kgdb_arch_handle_exception(int vector, int signo, int err_code, + char *remcom_in_buffer, char *remcom_out_buffer, + struct pt_regs *regs) +{ + char *ptr; + unsigned long address; + + /* Undo any stepping we may have done. */ + undo_single_step(regs); + + switch (remcom_in_buffer[0]) { + case 'c': + case 's': + case 'D': + case 'k': + /* + * Try to read optional parameter, pc unchanged if no parm. + * If this was a compiled-in breakpoint, we need to move + * to the next instruction or we will just breakpoint + * over and over again. + */ + ptr = &remcom_in_buffer[1]; + if (kgdb_hex2long(&ptr, &address)) + regs->pc = address; + else if (*(unsigned long *)regs->pc == compiled_bpt) + regs->pc += BREAK_INSTR_SIZE; + + if (remcom_in_buffer[0] == 's') { + do_single_step(regs); + kgdb_single_step = 1; + atomic_set(&kgdb_cpu_doing_single_step, + raw_smp_processor_id()); + } else + atomic_set(&kgdb_cpu_doing_single_step, -1); + + return 0; + } + + return -1; /* this means that we do not want to exit from the handler */ +} + +struct kgdb_arch arch_kgdb_ops; + +/* + * kgdb_arch_init - Perform any architecture specific initalization. + * + * This function will handle the initalization of any architecture + * specific callbacks. + */ +int kgdb_arch_init(void) +{ + tile_bundle_bits bundle = TILEGX_BPT_BUNDLE; + + memcpy(arch_kgdb_ops.gdb_bpt_instr, &bundle, BREAK_INSTR_SIZE); + return register_die_notifier(&kgdb_notifier); +} + +/* + * kgdb_arch_exit - Perform any architecture specific uninitalization. + * + * This function will handle the uninitalization of any architecture + * specific callbacks, for dynamic registration and unregistration. + */ +void kgdb_arch_exit(void) +{ + unregister_die_notifier(&kgdb_notifier); +} + +int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + unsigned long addr_wr = writable_address(bpt->bpt_addr); + + if (addr_wr == 0) + return -1; + + err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, + BREAK_INSTR_SIZE); + if (err) + return err; + + err = probe_kernel_write((char *)addr_wr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); + smp_wmb(); + flush_icache_range((unsigned long)bpt->bpt_addr, + (unsigned long)bpt->bpt_addr + BREAK_INSTR_SIZE); + return err; +} + +int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + unsigned long addr_wr = writable_address(bpt->bpt_addr); + + if (addr_wr == 0) + return -1; + + err = probe_kernel_write((char *)addr_wr, (char *)bpt->saved_instr, + BREAK_INSTR_SIZE); + smp_wmb(); + flush_icache_range((unsigned long)bpt->bpt_addr, + (unsigned long)bpt->bpt_addr + BREAK_INSTR_SIZE); + return err; +} diff --git a/arch/tile/kernel/kprobes.c b/arch/tile/kernel/kprobes.c new file mode 100644 index 00000000000..27cdcacbe81 --- /dev/null +++ b/arch/tile/kernel/kprobes.c @@ -0,0 +1,528 @@ +/* + * arch/tile/kernel/kprobes.c + * Kprobes on TILE-Gx + * + * Some portions copied from the MIPS version. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * Copyright 2006 Sony Corp. + * Copyright 2010 Cavium Networks + * + * Copyright 2012 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <asm/cacheflush.h> + +#include <arch/opcode.h> + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +tile_bundle_bits breakpoint_insn = TILEGX_BPT_BUNDLE; +tile_bundle_bits breakpoint2_insn = TILEGX_BPT_BUNDLE | DIE_SSTEPBP; + +/* + * Check whether instruction is branch or jump, or if executing it + * has different results depending on where it is executed (e.g. lnk). + */ +static int __kprobes insn_has_control(kprobe_opcode_t insn) +{ + if (get_Mode(insn) != 0) { /* Y-format bundle */ + if (get_Opcode_Y1(insn) != RRR_1_OPCODE_Y1 || + get_RRROpcodeExtension_Y1(insn) != UNARY_RRR_1_OPCODE_Y1) + return 0; + + switch (get_UnaryOpcodeExtension_Y1(insn)) { + case JALRP_UNARY_OPCODE_Y1: + case JALR_UNARY_OPCODE_Y1: + case JRP_UNARY_OPCODE_Y1: + case JR_UNARY_OPCODE_Y1: + case LNK_UNARY_OPCODE_Y1: + return 1; + default: + return 0; + } + } + + switch (get_Opcode_X1(insn)) { + case BRANCH_OPCODE_X1: /* branch instructions */ + case JUMP_OPCODE_X1: /* jump instructions: j and jal */ + return 1; + + case RRR_0_OPCODE_X1: /* other jump instructions */ + if (get_RRROpcodeExtension_X1(insn) != UNARY_RRR_0_OPCODE_X1) + return 0; + switch (get_UnaryOpcodeExtension_X1(insn)) { + case JALRP_UNARY_OPCODE_X1: + case JALR_UNARY_OPCODE_X1: + case JRP_UNARY_OPCODE_X1: + case JR_UNARY_OPCODE_X1: + case LNK_UNARY_OPCODE_X1: + return 1; + default: + return 0; + } + default: + return 0; + } +} + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr; + + if (addr & (sizeof(kprobe_opcode_t) - 1)) + return -EINVAL; + + if (insn_has_control(*p->addr)) { + pr_notice("Kprobes for control instructions are not " + "supported\n"); + return -EINVAL; + } + + /* insn: must be on special executable page on tile. */ + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + return -ENOMEM; + + /* + * In the kprobe->ainsn.insn[] array we store the original + * instruction at index zero and a break trap instruction at + * index one. + */ + memcpy(&p->ainsn.insn[0], p->addr, sizeof(kprobe_opcode_t)); + p->ainsn.insn[1] = breakpoint2_insn; + p->opcode = *p->addr; + + return 0; +} + +void __kprobes arch_arm_kprobe(struct kprobe *p) +{ + unsigned long addr_wr; + + /* Operate on writable kernel text mapping. */ + addr_wr = (unsigned long)p->addr - MEM_SV_START + PAGE_OFFSET; + + if (probe_kernel_write((void *)addr_wr, &breakpoint_insn, + sizeof(breakpoint_insn))) + pr_err("%s: failed to enable kprobe\n", __func__); + + smp_wmb(); + flush_insn_slot(p); +} + +void __kprobes arch_disarm_kprobe(struct kprobe *kp) +{ + unsigned long addr_wr; + + /* Operate on writable kernel text mapping. */ + addr_wr = (unsigned long)kp->addr - MEM_SV_START + PAGE_OFFSET; + + if (probe_kernel_write((void *)addr_wr, &kp->opcode, + sizeof(kp->opcode))) + pr_err("%s: failed to enable kprobe\n", __func__); + + smp_wmb(); + flush_insn_slot(kp); +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ + if (p->ainsn.insn) { + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; + } +} + +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.saved_pc = kcb->kprobe_saved_pc; +} + +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_saved_pc = kcb->prev_kprobe.saved_pc; +} + +static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, p); + kcb->kprobe_saved_pc = regs->pc; +} + +static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +{ + /* Single step inline if the instruction is a break. */ + if (p->opcode == breakpoint_insn || + p->opcode == breakpoint2_insn) + regs->pc = (unsigned long)p->addr; + else + regs->pc = (unsigned long)&p->ainsn.insn[0]; +} + +static int __kprobes kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *p; + int ret = 0; + kprobe_opcode_t *addr; + struct kprobe_ctlblk *kcb; + + addr = (kprobe_opcode_t *)regs->pc; + + /* + * We don't want to be preempted for the entire + * duration of kprobe processing. + */ + preempt_disable(); + kcb = get_kprobe_ctlblk(); + + /* Check we're not actually recursing. */ + if (kprobe_running()) { + p = get_kprobe(addr); + if (p) { + if (kcb->kprobe_status == KPROBE_HIT_SS && + p->ainsn.insn[0] == breakpoint_insn) { + goto no_kprobe; + } + /* + * We have reentered the kprobe_handler(), since + * another probe was hit while within the handler. + * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. + */ + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kprobes_inc_nmissed_count(p); + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_REENTER; + return 1; + } else { + if (*addr != breakpoint_insn) { + /* + * The breakpoint instruction was removed by + * another cpu right after we hit, no further + * handling of this interrupt is appropriate. + */ + ret = 1; + goto no_kprobe; + } + p = __this_cpu_read(current_kprobe); + if (p->break_handler && p->break_handler(p, regs)) + goto ss_probe; + } + goto no_kprobe; + } + + p = get_kprobe(addr); + if (!p) { + if (*addr != breakpoint_insn) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + */ + ret = 1; + } + /* Not one of ours: let kernel handle it. */ + goto no_kprobe; + } + + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + if (p->pre_handler && p->pre_handler(p, regs)) { + /* Handler has already set things up, so skip ss setup. */ + return 1; + } + +ss_probe: + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +/* + * Called after single-stepping. p->addr is the address of the + * instruction that has been replaced by the breakpoint. To avoid the + * SMP problems that can occur when we temporarily put back the + * original opcode to single-step, we single-stepped a copy of the + * instruction. The address of this copy is p->ainsn.insn. + * + * This function prepares to return from the post-single-step + * breakpoint trap. + */ +static void __kprobes resume_execution(struct kprobe *p, + struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + unsigned long orig_pc = kcb->kprobe_saved_pc; + regs->pc = orig_pc + 8; +} + +static inline int post_kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) + return 0; + + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + resume_execution(cur, regs, kcb); + + /* Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + goto out; + } + reset_current_kprobe(); +out: + preempt_enable_no_resched(); + + return 1; +} + +static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + + if (kcb->kprobe_status & KPROBE_HIT_SS) { + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the ip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. + */ + resume_execution(cur, regs, kcb); + reset_current_kprobe(); + preempt_enable_no_resched(); + } + return 0; +} + +/* + * Wrapper routine for handling exceptions. + */ +int __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + + switch (val) { + case DIE_BREAK: + if (kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_SSTEPBP: + if (post_kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_PAGE_FAULT: + /* kprobe_running() needs smp_processor_id(). */ + preempt_disable(); + + if (kprobe_running() + && kprobe_fault_handler(args->regs, args->trapnr)) + ret = NOTIFY_STOP; + preempt_enable(); + break; + default: + break; + } + return ret; +} + +int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + kcb->jprobe_saved_regs = *regs; + kcb->jprobe_saved_sp = regs->sp; + + memcpy(kcb->jprobes_stack, (void *)kcb->jprobe_saved_sp, + MIN_JPROBES_STACK_SIZE(kcb->jprobe_saved_sp)); + + regs->pc = (unsigned long)(jp->entry); + + return 1; +} + +/* Defined in the inline asm below. */ +void jprobe_return_end(void); + +void __kprobes jprobe_return(void) +{ + asm volatile( + "bpt\n\t" + ".globl jprobe_return_end\n" + "jprobe_return_end:\n"); +} + +int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (regs->pc >= (unsigned long)jprobe_return && + regs->pc <= (unsigned long)jprobe_return_end) { + *regs = kcb->jprobe_saved_regs; + memcpy((void *)kcb->jprobe_saved_sp, kcb->jprobes_stack, + MIN_JPROBES_STACK_SIZE(kcb->jprobe_saved_sp)); + preempt_enable_no_resched(); + + return 1; + } + return 0; +} + +/* + * Function return probe trampoline: + * - init_kprobes() establishes a probepoint here + * - When the probed function returns, this probe causes the + * handlers to fire + */ +static void __used kretprobe_trampoline_holder(void) +{ + asm volatile( + "nop\n\t" + ".global kretprobe_trampoline\n" + "kretprobe_trampoline:\n\t" + "nop\n\t" + : : : "memory"); +} + +void kretprobe_trampoline(void); + +void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + ri->ret_addr = (kprobe_opcode_t *) regs->lr; + + /* Replace the return addr with trampoline addr */ + regs->lr = (unsigned long)kretprobe_trampoline; +} + +/* + * Called when the probe at kretprobe trampoline is hit. + */ +static int __kprobes trampoline_probe_handler(struct kprobe *p, + struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *tmp; + unsigned long flags, orig_ret_address = 0; + unsigned long trampoline_address = (unsigned long)kretprobe_trampoline; + + INIT_HLIST_HEAD(&empty_rp); + kretprobe_hash_lock(current, &head, &flags); + + /* + * It is possible to have multiple instances associated with a given + * task either because multiple functions in the call path have + * a return probe installed on them, and/or more than one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri, &empty_rp); + + if (orig_ret_address != trampoline_address) { + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + instruction_pointer(regs) = orig_ret_address; + + reset_current_kprobe(); + kretprobe_hash_unlock(current, &flags); + preempt_enable_no_resched(); + + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we don't want the post_handler + * to run (and have re-enabled preemption) + */ + return 1; +} + +int __kprobes arch_trampoline_kprobe(struct kprobe *p) +{ + if (p->addr == (kprobe_opcode_t *)kretprobe_trampoline) + return 1; + + return 0; +} + +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *)kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init_kprobes(void) +{ + register_kprobe(&trampoline_p); + return 0; +} diff --git a/arch/tile/kernel/machine_kexec.c b/arch/tile/kernel/machine_kexec.c index 6255f2eab11..f0b54a93471 100644 --- a/arch/tile/kernel/machine_kexec.c +++ b/arch/tile/kernel/machine_kexec.c @@ -31,6 +31,8 @@ #include <asm/pgalloc.h> #include <asm/cacheflush.h> #include <asm/checksum.h> +#include <asm/tlbflush.h> +#include <asm/homecache.h> #include <hv/hypervisor.h> @@ -222,11 +224,22 @@ struct page *kimage_alloc_pages_arch(gfp_t gfp_mask, unsigned int order) return alloc_pages_node(0, gfp_mask, order); } +/* + * Address range in which pa=va mapping is set in setup_quasi_va_is_pa(). + * For tilepro, PAGE_OFFSET is used since this is the largest possbile value + * for tilepro, while for tilegx, we limit it to entire middle level page + * table which we assume has been allocated and is undoubtedly large enough. + */ +#ifndef __tilegx__ +#define QUASI_VA_IS_PA_ADDR_RANGE PAGE_OFFSET +#else +#define QUASI_VA_IS_PA_ADDR_RANGE PGDIR_SIZE +#endif + static void setup_quasi_va_is_pa(void) { - HV_PTE *pgtable; HV_PTE pte; - int i; + unsigned long i; /* * Flush our TLB to prevent conflicts between the previous contents @@ -234,16 +247,22 @@ static void setup_quasi_va_is_pa(void) */ local_flush_tlb_all(); - /* setup VA is PA, at least up to PAGE_OFFSET */ - - pgtable = (HV_PTE *)current->mm->pgd; + /* + * setup VA is PA, at least up to QUASI_VA_IS_PA_ADDR_RANGE. + * Note here we assume that level-1 page table is defined by + * HPAGE_SIZE. + */ pte = hv_pte(_PAGE_KERNEL | _PAGE_HUGE_PAGE); pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3); - - for (i = 0; i < pgd_index(PAGE_OFFSET); i++) { + for (i = 0; i < (QUASI_VA_IS_PA_ADDR_RANGE >> HPAGE_SHIFT); i++) { + unsigned long vaddr = i << HPAGE_SHIFT; + pgd_t *pgd = pgd_offset(current->mm, vaddr); + pud_t *pud = pud_offset(pgd, vaddr); + pte_t *ptep = (pte_t *) pmd_offset(pud, vaddr); unsigned long pfn = i << (HPAGE_SHIFT - PAGE_SHIFT); + if (pfn_valid(pfn)) - __set_pte(&pgtable[i], pfn_pte(pfn, pte)); + __set_pte(ptep, pfn_pte(pfn, pte)); } } @@ -251,6 +270,7 @@ static void setup_quasi_va_is_pa(void) void machine_kexec(struct kimage *image) { void *reboot_code_buffer; + pte_t *ptep; void (*rnk)(unsigned long, void *, unsigned long) __noreturn; @@ -266,8 +286,10 @@ void machine_kexec(struct kimage *image) */ homecache_change_page_home(image->control_code_page, 0, smp_processor_id()); - reboot_code_buffer = vmap(&image->control_code_page, 1, 0, - __pgprot(_PAGE_KERNEL | _PAGE_EXECUTABLE)); + reboot_code_buffer = page_address(image->control_code_page); + BUG_ON(reboot_code_buffer == NULL); + ptep = virt_to_pte(NULL, (unsigned long)reboot_code_buffer); + __set_pte(ptep, pte_mkexec(*ptep)); memcpy(reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size); __flush_icache_range( diff --git a/arch/tile/kernel/mcount_64.S b/arch/tile/kernel/mcount_64.S new file mode 100644 index 00000000000..70d7bb0c4d8 --- /dev/null +++ b/arch/tile/kernel/mcount_64.S @@ -0,0 +1,224 @@ +/* + * Copyright 2012 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + * + * TILE-Gx specific __mcount support + */ + +#include <linux/linkage.h> +#include <asm/ftrace.h> + +#define REGSIZE 8 + + .text + .global __mcount + + .macro MCOUNT_SAVE_REGS + addli sp, sp, -REGSIZE + { + st sp, lr + addli r29, sp, - (12 * REGSIZE) + } + { + addli sp, sp, - (13 * REGSIZE) + st r29, sp + } + addli r29, r29, REGSIZE + { st r29, r0; addli r29, r29, REGSIZE } + { st r29, r1; addli r29, r29, REGSIZE } + { st r29, r2; addli r29, r29, REGSIZE } + { st r29, r3; addli r29, r29, REGSIZE } + { st r29, r4; addli r29, r29, REGSIZE } + { st r29, r5; addli r29, r29, REGSIZE } + { st r29, r6; addli r29, r29, REGSIZE } + { st r29, r7; addli r29, r29, REGSIZE } + { st r29, r8; addli r29, r29, REGSIZE } + { st r29, r9; addli r29, r29, REGSIZE } + { st r29, r10; addli r29, r29, REGSIZE } + .endm + + .macro MCOUNT_RESTORE_REGS + addli r29, sp, (2 * REGSIZE) + { ld r0, r29; addli r29, r29, REGSIZE } + { ld r1, r29; addli r29, r29, REGSIZE } + { ld r2, r29; addli r29, r29, REGSIZE } + { ld r3, r29; addli r29, r29, REGSIZE } + { ld r4, r29; addli r29, r29, REGSIZE } + { ld r5, r29; addli r29, r29, REGSIZE } + { ld r6, r29; addli r29, r29, REGSIZE } + { ld r7, r29; addli r29, r29, REGSIZE } + { ld r8, r29; addli r29, r29, REGSIZE } + { ld r9, r29; addli r29, r29, REGSIZE } + { ld r10, r29; addli lr, sp, (13 * REGSIZE) } + { ld lr, lr; addli sp, sp, (14 * REGSIZE) } + .endm + + .macro RETURN_BACK + { move r12, lr; move lr, r10 } + jrp r12 + .endm + +#ifdef CONFIG_DYNAMIC_FTRACE + + .align 64 +STD_ENTRY(__mcount) +__mcount: + j ftrace_stub +STD_ENDPROC(__mcount) + + .align 64 +STD_ENTRY(ftrace_caller) + moveli r11, hw2_last(function_trace_stop) + { shl16insli r11, r11, hw1(function_trace_stop); move r12, lr } + { shl16insli r11, r11, hw0(function_trace_stop); move lr, r10 } + ld r11, r11 + beqz r11, 1f + jrp r12 + +1: + { move r10, lr; move lr, r12 } + MCOUNT_SAVE_REGS + + /* arg1: self return address */ + /* arg2: parent's return address */ + { move r0, lr; move r1, r10 } + + .global ftrace_call +ftrace_call: + /* + * a placeholder for the call to a real tracing function, i.e. + * ftrace_trace_function() + */ + nop + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + .global ftrace_graph_call +ftrace_graph_call: + /* + * a placeholder for the call to a real tracing function, i.e. + * ftrace_graph_caller() + */ + nop +#endif + MCOUNT_RESTORE_REGS + .global ftrace_stub +ftrace_stub: + RETURN_BACK +STD_ENDPROC(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ + + .align 64 +STD_ENTRY(__mcount) + moveli r11, hw2_last(function_trace_stop) + { shl16insli r11, r11, hw1(function_trace_stop); move r12, lr } + { shl16insli r11, r11, hw0(function_trace_stop); move lr, r10 } + ld r11, r11 + beqz r11, 1f + jrp r12 + +1: + { move r10, lr; move lr, r12 } + { + moveli r11, hw2_last(ftrace_trace_function) + moveli r13, hw2_last(ftrace_stub) + } + { + shl16insli r11, r11, hw1(ftrace_trace_function) + shl16insli r13, r13, hw1(ftrace_stub) + } + { + shl16insli r11, r11, hw0(ftrace_trace_function) + shl16insli r13, r13, hw0(ftrace_stub) + } + + ld r11, r11 + sub r14, r13, r11 + bnez r14, static_trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + moveli r15, hw2_last(ftrace_graph_return) + shl16insli r15, r15, hw1(ftrace_graph_return) + shl16insli r15, r15, hw0(ftrace_graph_return) + ld r15, r15 + sub r15, r15, r13 + bnez r15, ftrace_graph_caller + + { + moveli r16, hw2_last(ftrace_graph_entry) + moveli r17, hw2_last(ftrace_graph_entry_stub) + } + { + shl16insli r16, r16, hw1(ftrace_graph_entry) + shl16insli r17, r17, hw1(ftrace_graph_entry_stub) + } + { + shl16insli r16, r16, hw0(ftrace_graph_entry) + shl16insli r17, r17, hw0(ftrace_graph_entry_stub) + } + ld r16, r16 + sub r17, r16, r17 + bnez r17, ftrace_graph_caller + +#endif + RETURN_BACK + +static_trace: + MCOUNT_SAVE_REGS + + /* arg1: self return address */ + /* arg2: parent's return address */ + { move r0, lr; move r1, r10 } + + /* call ftrace_trace_function() */ + jalr r11 + + MCOUNT_RESTORE_REGS + + .global ftrace_stub +ftrace_stub: + RETURN_BACK +STD_ENDPROC(__mcount) + +#endif /* ! CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +STD_ENTRY(ftrace_graph_caller) +ftrace_graph_caller: +#ifndef CONFIG_DYNAMIC_FTRACE + MCOUNT_SAVE_REGS +#endif + + /* arg1: Get the location of the parent's return address */ + addi r0, sp, 12 * REGSIZE + /* arg2: Get self return address */ + move r1, lr + + jal prepare_ftrace_return + + MCOUNT_RESTORE_REGS + RETURN_BACK +STD_ENDPROC(ftrace_graph_caller) + + .global return_to_handler +return_to_handler: + MCOUNT_SAVE_REGS + + jal ftrace_return_to_handler + /* restore the real parent address */ + move r11, r0 + + MCOUNT_RESTORE_REGS + jr r11 + +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/tile/kernel/messaging.c b/arch/tile/kernel/messaging.c index 0858ee6b520..7867266f971 100644 --- a/arch/tile/kernel/messaging.c +++ b/arch/tile/kernel/messaging.c @@ -25,7 +25,7 @@ /* All messages are stored here */ static DEFINE_PER_CPU(HV_MsgState, msg_state); -void __cpuinit init_messaging(void) +void init_messaging(void) { /* Allocate storage for messages in kernel space */ HV_MsgState *state = &__get_cpu_var(msg_state); @@ -68,8 +68,8 @@ void hv_message_intr(struct pt_regs *regs, int intnum) #endif while (1) { - rmi = hv_receive_message(__get_cpu_var(msg_state), - (HV_VirtAddr) message, + HV_MsgState *state = this_cpu_ptr(&msg_state); + rmi = hv_receive_message(*state, (HV_VirtAddr) message, sizeof(message)); if (rmi.msglen == 0) break; diff --git a/arch/tile/kernel/module.c b/arch/tile/kernel/module.c index b90ab992567..4918d91bc3a 100644 --- a/arch/tile/kernel/module.c +++ b/arch/tile/kernel/module.c @@ -24,16 +24,6 @@ #include <asm/homecache.h> #include <arch/opcode.h> -#ifdef __tilegx__ -# define Elf_Rela Elf64_Rela -# define ELF_R_SYM ELF64_R_SYM -# define ELF_R_TYPE ELF64_R_TYPE -#else -# define Elf_Rela Elf32_Rela -# define ELF_R_SYM ELF32_R_SYM -# define ELF_R_TYPE ELF32_R_TYPE -#endif - #ifdef MODULE_DEBUG #define DEBUGP printk #else @@ -52,8 +42,6 @@ void *module_alloc(unsigned long size) int i = 0; int npages; - if (size == 0) - return NULL; npages = (size + PAGE_SIZE - 1) / PAGE_SIZE; pages = kmalloc(npages * sizeof(struct page *), GFP_KERNEL); if (pages == NULL) @@ -67,6 +55,8 @@ void *module_alloc(unsigned long size) area = __get_vm_area(size, VM_ALLOC, MEM_MODULE_START, MEM_MODULE_END); if (!area) goto error; + area->nr_pages = npages; + area->pages = pages; if (map_vm_area(area, prot_rwx, &pages)) { vunmap(area->addr); @@ -157,7 +147,17 @@ int apply_relocate_add(Elf_Shdr *sechdrs, switch (ELF_R_TYPE(rel[i].r_info)) { -#define MUNGE(func) (*location = ((*location & ~func(-1)) | func(value))) +#ifdef __LITTLE_ENDIAN +# define MUNGE(func) \ + (*location = ((*location & ~func(-1)) | func(value))) +#else +/* + * Instructions are always little-endian, so when we read them as data, + * we have to swap them around before and after modifying them. + */ +# define MUNGE(func) \ + (*location = swab64((swab64(*location) & ~func(-1)) | func(value))) +#endif #ifndef __tilegx__ case R_TILE_32: diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c index b3ed19f8779..09b58703ac2 100644 --- a/arch/tile/kernel/pci-dma.c +++ b/arch/tile/kernel/pci-dma.c @@ -14,6 +14,7 @@ #include <linux/mm.h> #include <linux/dma-mapping.h> +#include <linux/swiotlb.h> #include <linux/vmalloc.h> #include <linux/export.h> #include <asm/tlbflush.h> @@ -22,16 +23,22 @@ /* Generic DMA mapping functions: */ /* - * Allocate what Linux calls "coherent" memory, which for us just - * means uncached. + * Allocate what Linux calls "coherent" memory. On TILEPro this is + * uncached memory; on TILE-Gx it is hash-for-home memory. */ -void *dma_alloc_coherent(struct device *dev, - size_t size, - dma_addr_t *dma_handle, - gfp_t gfp) +#ifdef __tilepro__ +#define PAGE_HOME_DMA PAGE_HOME_UNCACHED +#else +#define PAGE_HOME_DMA PAGE_HOME_HASH +#endif + +static void *tile_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + struct dma_attrs *attrs) { - u64 dma_mask = dev->coherent_dma_mask ?: DMA_BIT_MASK(32); - int node = dev_to_node(dev); + u64 dma_mask = (dev && dev->coherent_dma_mask) ? + dev->coherent_dma_mask : DMA_BIT_MASK(32); + int node = dev ? dev_to_node(dev) : 0; int order = get_order(size); struct page *pg; dma_addr_t addr; @@ -39,39 +46,42 @@ void *dma_alloc_coherent(struct device *dev, gfp |= __GFP_ZERO; /* - * By forcing NUMA node 0 for 32-bit masks we ensure that the - * high 32 bits of the resulting PA will be zero. If the mask - * size is, e.g., 24, we may still not be able to guarantee a - * suitable memory address, in which case we will return NULL. - * But such devices are uncommon. + * If the mask specifies that the memory be in the first 4 GB, then + * we force the allocation to come from the DMA zone. We also + * force the node to 0 since that's the only node where the DMA + * zone isn't empty. If the mask size is smaller than 32 bits, we + * may still not be able to guarantee a suitable memory address, in + * which case we will return NULL. But such devices are uncommon. */ - if (dma_mask <= DMA_BIT_MASK(32)) + if (dma_mask <= DMA_BIT_MASK(32)) { + gfp |= GFP_DMA; node = 0; + } - pg = homecache_alloc_pages_node(node, gfp, order, PAGE_HOME_UNCACHED); + pg = homecache_alloc_pages_node(node, gfp, order, PAGE_HOME_DMA); if (pg == NULL) return NULL; addr = page_to_phys(pg); if (addr + size > dma_mask) { - homecache_free_pages(addr, order); + __homecache_free_pages(pg, order); return NULL; } *dma_handle = addr; + return page_address(pg); } -EXPORT_SYMBOL(dma_alloc_coherent); /* - * Free memory that was allocated with dma_alloc_coherent. + * Free memory that was allocated with tile_dma_alloc_coherent. */ -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) +static void tile_dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle, + struct dma_attrs *attrs) { homecache_free_pages((unsigned long)vaddr, get_order(size)); } -EXPORT_SYMBOL(dma_free_coherent); /* * The map routines "map" the specified address range for DMA @@ -87,52 +97,285 @@ EXPORT_SYMBOL(dma_free_coherent); * can count on nothing having been touched. */ -/* Flush a PA range from cache page by page. */ -static void __dma_map_pa_range(dma_addr_t dma_addr, size_t size) +/* Set up a single page for DMA access. */ +static void __dma_prep_page(struct page *page, unsigned long offset, + size_t size, enum dma_data_direction direction) +{ + /* + * Flush the page from cache if necessary. + * On tilegx, data is delivered to hash-for-home L3; on tilepro, + * data is delivered direct to memory. + * + * NOTE: If we were just doing DMA_TO_DEVICE we could optimize + * this to be a "flush" not a "finv" and keep some of the + * state in cache across the DMA operation, but it doesn't seem + * worth creating the necessary flush_buffer_xxx() infrastructure. + */ + int home = page_home(page); + switch (home) { + case PAGE_HOME_HASH: +#ifdef __tilegx__ + return; +#endif + break; + case PAGE_HOME_UNCACHED: +#ifdef __tilepro__ + return; +#endif + break; + case PAGE_HOME_IMMUTABLE: + /* Should be going to the device only. */ + BUG_ON(direction == DMA_FROM_DEVICE || + direction == DMA_BIDIRECTIONAL); + return; + case PAGE_HOME_INCOHERENT: + /* Incoherent anyway, so no need to work hard here. */ + return; + default: + BUG_ON(home < 0 || home >= NR_CPUS); + break; + } + homecache_finv_page(page); + +#ifdef DEBUG_ALIGNMENT + /* Warn if the region isn't cacheline aligned. */ + if (offset & (L2_CACHE_BYTES - 1) || (size & (L2_CACHE_BYTES - 1))) + pr_warn("Unaligned DMA to non-hfh memory: PA %#llx/%#lx\n", + PFN_PHYS(page_to_pfn(page)) + offset, size); +#endif +} + +/* Make the page ready to be read by the core. */ +static void __dma_complete_page(struct page *page, unsigned long offset, + size_t size, enum dma_data_direction direction) +{ +#ifdef __tilegx__ + switch (page_home(page)) { + case PAGE_HOME_HASH: + /* I/O device delivered data the way the cpu wanted it. */ + break; + case PAGE_HOME_INCOHERENT: + /* Incoherent anyway, so no need to work hard here. */ + break; + case PAGE_HOME_IMMUTABLE: + /* Extra read-only copies are not a problem. */ + break; + default: + /* Flush the bogus hash-for-home I/O entries to memory. */ + homecache_finv_map_page(page, PAGE_HOME_HASH); + break; + } +#endif +} + +static void __dma_prep_pa_range(dma_addr_t dma_addr, size_t size, + enum dma_data_direction direction) { struct page *page = pfn_to_page(PFN_DOWN(dma_addr)); - size_t bytesleft = PAGE_SIZE - (dma_addr & (PAGE_SIZE - 1)); + unsigned long offset = dma_addr & (PAGE_SIZE - 1); + size_t bytes = min(size, (size_t)(PAGE_SIZE - offset)); + + while (size != 0) { + __dma_prep_page(page, offset, bytes, direction); + size -= bytes; + ++page; + offset = 0; + bytes = min((size_t)PAGE_SIZE, size); + } +} - while ((ssize_t)size > 0) { - /* Flush the page. */ - homecache_flush_cache(page++, 0); +static void __dma_complete_pa_range(dma_addr_t dma_addr, size_t size, + enum dma_data_direction direction) +{ + struct page *page = pfn_to_page(PFN_DOWN(dma_addr)); + unsigned long offset = dma_addr & (PAGE_SIZE - 1); + size_t bytes = min(size, (size_t)(PAGE_SIZE - offset)); + + while (size != 0) { + __dma_complete_page(page, offset, bytes, direction); + size -= bytes; + ++page; + offset = 0; + bytes = min((size_t)PAGE_SIZE, size); + } +} + +static int tile_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(!valid_dma_direction(direction)); + + WARN_ON(nents == 0 || sglist->length == 0); - /* Figure out if we need to continue on the next page. */ - size -= bytesleft; - bytesleft = PAGE_SIZE; + for_each_sg(sglist, sg, nents, i) { + sg->dma_address = sg_phys(sg); + __dma_prep_pa_range(sg->dma_address, sg->length, direction); +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif } + + return nents; } -/* - * dma_map_single can be passed any memory address, and there appear - * to be no alignment constraints. - * - * There is a chance that the start of the buffer will share a cache - * line with some other data that has been touched in the meantime. - */ -dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction direction) +static void tile_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) { - dma_addr_t dma_addr = __pa(ptr); + struct scatterlist *sg; + int i; BUG_ON(!valid_dma_direction(direction)); - WARN_ON(size == 0); + for_each_sg(sglist, sg, nents, i) { + sg->dma_address = sg_phys(sg); + __dma_complete_pa_range(sg->dma_address, sg->length, + direction); + } +} + +static dma_addr_t tile_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + BUG_ON(!valid_dma_direction(direction)); + + BUG_ON(offset + size > PAGE_SIZE); + __dma_prep_page(page, offset, size, direction); + + return page_to_pa(page) + offset; +} + +static void tile_dma_unmap_page(struct device *dev, dma_addr_t dma_address, + size_t size, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + BUG_ON(!valid_dma_direction(direction)); + + __dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)), + dma_address & (PAGE_SIZE - 1), size, direction); +} + +static void tile_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, + size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); - __dma_map_pa_range(dma_addr, size); + __dma_complete_pa_range(dma_handle, size, direction); +} - return dma_addr; +static void tile_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + __dma_prep_pa_range(dma_handle, size, direction); } -EXPORT_SYMBOL(dma_map_single); -void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) +static void tile_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction) { + struct scatterlist *sg; + int i; + BUG_ON(!valid_dma_direction(direction)); + WARN_ON(nelems == 0 || sglist->length == 0); + + for_each_sg(sglist, sg, nelems, i) { + dma_sync_single_for_cpu(dev, sg->dma_address, + sg_dma_len(sg), direction); + } } -EXPORT_SYMBOL(dma_unmap_single); -int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, - enum dma_data_direction direction) +static void tile_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction) +{ + struct scatterlist *sg; + int i; + + BUG_ON(!valid_dma_direction(direction)); + WARN_ON(nelems == 0 || sglist->length == 0); + + for_each_sg(sglist, sg, nelems, i) { + dma_sync_single_for_device(dev, sg->dma_address, + sg_dma_len(sg), direction); + } +} + +static inline int +tile_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return 0; +} + +static inline int +tile_dma_supported(struct device *dev, u64 mask) +{ + return 1; +} + +static struct dma_map_ops tile_default_dma_map_ops = { + .alloc = tile_dma_alloc_coherent, + .free = tile_dma_free_coherent, + .map_page = tile_dma_map_page, + .unmap_page = tile_dma_unmap_page, + .map_sg = tile_dma_map_sg, + .unmap_sg = tile_dma_unmap_sg, + .sync_single_for_cpu = tile_dma_sync_single_for_cpu, + .sync_single_for_device = tile_dma_sync_single_for_device, + .sync_sg_for_cpu = tile_dma_sync_sg_for_cpu, + .sync_sg_for_device = tile_dma_sync_sg_for_device, + .mapping_error = tile_dma_mapping_error, + .dma_supported = tile_dma_supported +}; + +struct dma_map_ops *tile_dma_map_ops = &tile_default_dma_map_ops; +EXPORT_SYMBOL(tile_dma_map_ops); + +/* Generic PCI DMA mapping functions */ + +static void *tile_pci_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + struct dma_attrs *attrs) +{ + int node = dev_to_node(dev); + int order = get_order(size); + struct page *pg; + dma_addr_t addr; + + gfp |= __GFP_ZERO; + + pg = homecache_alloc_pages_node(node, gfp, order, PAGE_HOME_DMA); + if (pg == NULL) + return NULL; + + addr = page_to_phys(pg); + + *dma_handle = addr + get_dma_offset(dev); + + return page_address(pg); +} + +/* + * Free memory that was allocated with tile_pci_dma_alloc_coherent. + */ +static void tile_pci_dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ + homecache_free_pages((unsigned long)vaddr, get_order(size)); +} + +static int tile_pci_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) { struct scatterlist *sg; int i; @@ -143,73 +386,103 @@ int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, for_each_sg(sglist, sg, nents, i) { sg->dma_address = sg_phys(sg); - __dma_map_pa_range(sg->dma_address, sg->length); + __dma_prep_pa_range(sg->dma_address, sg->length, direction); + + sg->dma_address = sg->dma_address + get_dma_offset(dev); +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif } return nents; } -EXPORT_SYMBOL(dma_map_sg); -void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction) +static void tile_pci_dma_unmap_sg(struct device *dev, + struct scatterlist *sglist, int nents, + enum dma_data_direction direction, + struct dma_attrs *attrs) { + struct scatterlist *sg; + int i; + BUG_ON(!valid_dma_direction(direction)); + for_each_sg(sglist, sg, nents, i) { + sg->dma_address = sg_phys(sg); + __dma_complete_pa_range(sg->dma_address, sg->length, + direction); + } } -EXPORT_SYMBOL(dma_unmap_sg); -dma_addr_t dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction) +static dma_addr_t tile_pci_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) { BUG_ON(!valid_dma_direction(direction)); BUG_ON(offset + size > PAGE_SIZE); - homecache_flush_cache(page, 0); + __dma_prep_page(page, offset, size, direction); - return page_to_pa(page) + offset; + return page_to_pa(page) + offset + get_dma_offset(dev); } -EXPORT_SYMBOL(dma_map_page); -void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) +static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address, + size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) { BUG_ON(!valid_dma_direction(direction)); + + dma_address -= get_dma_offset(dev); + + __dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)), + dma_address & (PAGE_SIZE - 1), size, direction); } -EXPORT_SYMBOL(dma_unmap_page); -void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction direction) +static void tile_pci_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, + size_t size, + enum dma_data_direction direction) { BUG_ON(!valid_dma_direction(direction)); + + dma_handle -= get_dma_offset(dev); + + __dma_complete_pa_range(dma_handle, size, direction); } -EXPORT_SYMBOL(dma_sync_single_for_cpu); -void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction direction) +static void tile_pci_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, + size_t size, + enum dma_data_direction + direction) { - unsigned long start = PFN_DOWN(dma_handle); - unsigned long end = PFN_DOWN(dma_handle + size - 1); - unsigned long i; + dma_handle -= get_dma_offset(dev); - BUG_ON(!valid_dma_direction(direction)); - for (i = start; i <= end; ++i) - homecache_flush_cache(pfn_to_page(i), 0); + __dma_prep_pa_range(dma_handle, size, direction); } -EXPORT_SYMBOL(dma_sync_single_for_device); -void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) +static void tile_pci_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sglist, + int nelems, + enum dma_data_direction direction) { + struct scatterlist *sg; + int i; + BUG_ON(!valid_dma_direction(direction)); - WARN_ON(nelems == 0 || sg[0].length == 0); + WARN_ON(nelems == 0 || sglist->length == 0); + + for_each_sg(sglist, sg, nelems, i) { + dma_sync_single_for_cpu(dev, sg->dma_address, + sg_dma_len(sg), direction); + } } -EXPORT_SYMBOL(dma_sync_sg_for_cpu); -/* - * Flush and invalidate cache for scatterlist. - */ -void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction direction) +static void tile_pci_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sglist, + int nelems, + enum dma_data_direction direction) { struct scatterlist *sg; int i; @@ -222,31 +495,136 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, sg_dma_len(sg), direction); } } -EXPORT_SYMBOL(dma_sync_sg_for_device); -void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) +static inline int +tile_pci_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return 0; +} + +static inline int +tile_pci_dma_supported(struct device *dev, u64 mask) +{ + return 1; +} + +static struct dma_map_ops tile_pci_default_dma_map_ops = { + .alloc = tile_pci_dma_alloc_coherent, + .free = tile_pci_dma_free_coherent, + .map_page = tile_pci_dma_map_page, + .unmap_page = tile_pci_dma_unmap_page, + .map_sg = tile_pci_dma_map_sg, + .unmap_sg = tile_pci_dma_unmap_sg, + .sync_single_for_cpu = tile_pci_dma_sync_single_for_cpu, + .sync_single_for_device = tile_pci_dma_sync_single_for_device, + .sync_sg_for_cpu = tile_pci_dma_sync_sg_for_cpu, + .sync_sg_for_device = tile_pci_dma_sync_sg_for_device, + .mapping_error = tile_pci_dma_mapping_error, + .dma_supported = tile_pci_dma_supported +}; + +struct dma_map_ops *gx_pci_dma_map_ops = &tile_pci_default_dma_map_ops; +EXPORT_SYMBOL(gx_pci_dma_map_ops); + +/* PCI DMA mapping functions for legacy PCI devices */ + +#ifdef CONFIG_SWIOTLB +static void *tile_swiotlb_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + struct dma_attrs *attrs) { - dma_sync_single_for_cpu(dev, dma_handle + offset, size, direction); + gfp |= GFP_DMA; + return swiotlb_alloc_coherent(dev, size, dma_handle, gfp); } -EXPORT_SYMBOL(dma_sync_single_range_for_cpu); -void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) +static void tile_swiotlb_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + struct dma_attrs *attrs) { - dma_sync_single_for_device(dev, dma_handle + offset, size, direction); + swiotlb_free_coherent(dev, size, vaddr, dma_addr); +} + +static struct dma_map_ops pci_swiotlb_dma_ops = { + .alloc = tile_swiotlb_alloc_coherent, + .free = tile_swiotlb_free_coherent, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .dma_supported = swiotlb_dma_supported, + .mapping_error = swiotlb_dma_mapping_error, +}; + +static struct dma_map_ops pci_hybrid_dma_ops = { + .alloc = tile_swiotlb_alloc_coherent, + .free = tile_swiotlb_free_coherent, + .map_page = tile_pci_dma_map_page, + .unmap_page = tile_pci_dma_unmap_page, + .map_sg = tile_pci_dma_map_sg, + .unmap_sg = tile_pci_dma_unmap_sg, + .sync_single_for_cpu = tile_pci_dma_sync_single_for_cpu, + .sync_single_for_device = tile_pci_dma_sync_single_for_device, + .sync_sg_for_cpu = tile_pci_dma_sync_sg_for_cpu, + .sync_sg_for_device = tile_pci_dma_sync_sg_for_device, + .mapping_error = tile_pci_dma_mapping_error, + .dma_supported = tile_pci_dma_supported +}; + +struct dma_map_ops *gx_legacy_pci_dma_map_ops = &pci_swiotlb_dma_ops; +struct dma_map_ops *gx_hybrid_pci_dma_map_ops = &pci_hybrid_dma_ops; +#else +struct dma_map_ops *gx_legacy_pci_dma_map_ops; +struct dma_map_ops *gx_hybrid_pci_dma_map_ops; +#endif +EXPORT_SYMBOL(gx_legacy_pci_dma_map_ops); +EXPORT_SYMBOL(gx_hybrid_pci_dma_map_ops); + +#ifdef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK +int dma_set_coherent_mask(struct device *dev, u64 mask) +{ + struct dma_map_ops *dma_ops = get_dma_ops(dev); + + /* + * For PCI devices with 64-bit DMA addressing capability, promote + * the dma_ops to full capability for both streams and consistent + * memory access. For 32-bit capable devices, limit the consistent + * memory DMA range to max_direct_dma_addr. + */ + if (dma_ops == gx_pci_dma_map_ops || + dma_ops == gx_hybrid_pci_dma_map_ops || + dma_ops == gx_legacy_pci_dma_map_ops) { + if (mask == DMA_BIT_MASK(64)) + set_dma_ops(dev, gx_pci_dma_map_ops); + else if (mask > dev->archdata.max_direct_dma_addr) + mask = dev->archdata.max_direct_dma_addr; + } + + if (!dma_supported(dev, mask)) + return -EIO; + dev->coherent_dma_mask = mask; + return 0; } -EXPORT_SYMBOL(dma_sync_single_range_for_device); +EXPORT_SYMBOL(dma_set_coherent_mask); +#endif +#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK /* - * dma_alloc_noncoherent() returns non-cacheable memory, so there's no - * need to do any flushing here. + * The generic dma_get_required_mask() uses the highest physical address + * (max_pfn) to provide the hint to the PCI drivers regarding 32-bit or + * 64-bit DMA configuration. Since TILEGx has I/O TLB/MMU, allowing the + * DMAs to use the full 64-bit PCI address space and not limited by + * the physical memory space, we always let the PCI devices use + * 64-bit DMA if they have that capability, by returning the 64-bit + * DMA mask here. The device driver has the option to use 32-bit DMA if + * the device is not capable of 64-bit DMA. */ -void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) +u64 dma_get_required_mask(struct device *dev) { + return DMA_BIT_MASK(64); } -EXPORT_SYMBOL(dma_cache_sync); +EXPORT_SYMBOL_GPL(dma_get_required_mask); +#endif diff --git a/arch/tile/kernel/pci.c b/arch/tile/kernel/pci.c index a1bb59eecc1..1f80a88c75a 100644 --- a/arch/tile/kernel/pci.c +++ b/arch/tile/kernel/pci.c @@ -20,7 +20,6 @@ #include <linux/capability.h> #include <linux/sched.h> #include <linux/errno.h> -#include <linux/bootmem.h> #include <linux/irq.h> #include <linux/io.h> #include <linux/uaccess.h> @@ -52,6 +51,8 @@ * */ +static int pci_probe = 1; + /* * This flag tells if the platform is TILEmpower that needs * special configuration for the PLX switch chip. @@ -81,7 +82,7 @@ EXPORT_SYMBOL(pcibios_align_resource); * controller_id is the controller number, config type is 0 or 1 for * config0 or config1 operations. */ -static int __devinit tile_pcie_open(int controller_id, int config_type) +static int tile_pcie_open(int controller_id, int config_type) { char filename[32]; int fd; @@ -97,8 +98,7 @@ static int __devinit tile_pcie_open(int controller_id, int config_type) /* * Get the IRQ numbers from the HV and set up the handlers for them. */ -static int __devinit tile_init_irqs(int controller_id, - struct pci_controller *controller) +static int tile_init_irqs(int controller_id, struct pci_controller *controller) { char filename[32]; int fd; @@ -141,10 +141,15 @@ static int __devinit tile_init_irqs(int controller_id, * * Returns the number of controllers discovered. */ -int __devinit tile_pci_init(void) +int __init tile_pci_init(void) { int i; + if (!pci_probe) { + pr_info("PCI: disabled by boot argument\n"); + return 0; + } + pr_info("PCI: Searching for controllers...\n"); /* Re-init number of PCIe controllers to support hot-plug feature. */ @@ -193,7 +198,6 @@ int __devinit tile_pci_init(void) controller->hv_cfg_fd[0] = hv_cfg_fd0; controller->hv_cfg_fd[1] = hv_cfg_fd1; controller->hv_mem_fd = hv_mem_fd; - controller->first_busno = 0; controller->last_busno = 0xff; controller->ops = &tile_cfg_ops; @@ -237,7 +241,7 @@ static int tile_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) } -static void __devinit fixup_read_and_payload_sizes(void) +static void fixup_read_and_payload_sizes(void) { struct pci_dev *dev = NULL; int smallest_max_payload = 0x1; /* Tile maxes out at 256 bytes. */ @@ -245,39 +249,20 @@ static void __devinit fixup_read_and_payload_sizes(void) u16 new_values; /* Scan for the smallest maximum payload size. */ - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - int pcie_caps_offset; - u32 devcap; - int max_payload; - - pcie_caps_offset = pci_find_capability(dev, PCI_CAP_ID_EXP); - if (pcie_caps_offset == 0) + for_each_pci_dev(dev) { + if (!pci_is_pcie(dev)) continue; - pci_read_config_dword(dev, pcie_caps_offset + PCI_EXP_DEVCAP, - &devcap); - max_payload = devcap & PCI_EXP_DEVCAP_PAYLOAD; - if (max_payload < smallest_max_payload) - smallest_max_payload = max_payload; + if (dev->pcie_mpss < smallest_max_payload) + smallest_max_payload = dev->pcie_mpss; } /* Now, set the max_payload_size for all devices to that value. */ new_values = (max_read_size << 12) | (smallest_max_payload << 5); - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - int pcie_caps_offset; - u16 devctl; - - pcie_caps_offset = pci_find_capability(dev, PCI_CAP_ID_EXP); - if (pcie_caps_offset == 0) - continue; - - pci_read_config_word(dev, pcie_caps_offset + PCI_EXP_DEVCTL, - &devctl); - devctl &= ~(PCI_EXP_DEVCTL_PAYLOAD | PCI_EXP_DEVCTL_READRQ); - devctl |= new_values; - pci_write_config_word(dev, pcie_caps_offset + PCI_EXP_DEVCTL, - devctl); - } + for_each_pci_dev(dev) + pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, + PCI_EXP_DEVCTL_PAYLOAD | PCI_EXP_DEVCTL_READRQ, + new_values); } @@ -287,7 +272,7 @@ static void __devinit fixup_read_and_payload_sizes(void) * The controllers have been set up by the time we get here, by a call to * tile_pci_init. */ -int __devinit pcibios_init(void) +int __init pcibios_init(void) { int i; @@ -298,7 +283,7 @@ int __devinit pcibios_init(void) * known to require at least 20ms here, but we use a more * conservative value. */ - mdelay(250); + msleep(250); /* Scan all of the recorded PCI controllers. */ for (i = 0; i < TILE_NUM_PCIE; i++) { @@ -310,6 +295,7 @@ int __devinit pcibios_init(void) if (pci_scan_flags[i] == 0 && controllers[i].ops != NULL) { struct pci_controller *controller = &controllers[i]; struct pci_bus *bus; + LIST_HEAD(resources); if (tile_init_irqs(i, controller)) { pr_err("PCI: Could not initialize IRQs\n"); @@ -318,18 +304,12 @@ int __devinit pcibios_init(void) pr_info("PCI: initializing controller #%d\n", i); - /* - * This comes from the generic Linux PCI driver. - * - * It reads the PCI tree for this bus into the Linux - * data structures. - * - * This is inlined in linux/pci.h and calls into - * pci_scan_bus_parented() in probe.c. - */ - bus = pci_scan_bus(0, controller->ops, controller); + pci_add_resource(&resources, &ioport_resource); + pci_add_resource(&resources, &iomem_resource); + bus = pci_scan_root_bus(NULL, 0, controller->ops, + controller, &resources); controller->root_bus = bus; - controller->last_busno = bus->subordinate; + controller->last_busno = bus->busn_res.end; } } @@ -390,7 +370,7 @@ subsys_initcall(pcibios_init); /* * No bus fixups needed. */ -void __devinit pcibios_fixup_bus(struct pci_bus *bus) +void pcibios_fixup_bus(struct pci_bus *bus) { /* Nothing needs to be done. */ } @@ -400,25 +380,17 @@ void pcibios_set_master(struct pci_dev *dev) /* No special bus mastering setup handling. */ } -/* - * This can be called from the generic PCI layer, but doesn't need to - * do anything. - */ -char __devinit *pcibios_setup(char *str) +/* Process any "pci=" kernel boot arguments. */ +char *__init pcibios_setup(char *str) { - /* Nothing needs to be done. */ + if (!strcmp(str, "off")) { + pci_probe = 0; + return NULL; + } return str; } /* - * This is called from the generic Linux layer. - */ -void __devinit pcibios_update_irq(struct pci_dev *dev, int irq) -{ - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); -} - -/* * Enable memory and/or address decoding, as appropriate, for the * device described by the 'dev' struct. * @@ -487,11 +459,8 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) * specified bus & slot. */ -static int __devinit tile_cfg_read(struct pci_bus *bus, - unsigned int devfn, - int offset, - int size, - u32 *val) +static int tile_cfg_read(struct pci_bus *bus, unsigned int devfn, int offset, + int size, u32 *val) { struct pci_controller *controller = bus->sysdata; int busnum = bus->number & 0xff; @@ -533,11 +502,8 @@ static int __devinit tile_cfg_read(struct pci_bus *bus, * See tile_cfg_read() for relevant comments. * Note that "val" is the value to write, not a pointer to that value. */ -static int __devinit tile_cfg_write(struct pci_bus *bus, - unsigned int devfn, - int offset, - int size, - u32 val) +static int tile_cfg_write(struct pci_bus *bus, unsigned int devfn, int offset, + int size, u32 val) { struct pci_controller *controller = bus->sysdata; int busnum = bus->number & 0xff; diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c new file mode 100644 index 00000000000..e39f9c54280 --- /dev/null +++ b/arch/tile/kernel/pci_gx.c @@ -0,0 +1,1610 @@ +/* + * Copyright 2012 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/kernel.h> +#include <linux/mmzone.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/capability.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/irq.h> +#include <linux/msi.h> +#include <linux/io.h> +#include <linux/uaccess.h> +#include <linux/ctype.h> + +#include <asm/processor.h> +#include <asm/sections.h> +#include <asm/byteorder.h> + +#include <gxio/iorpc_globals.h> +#include <gxio/kiorpc.h> +#include <gxio/trio.h> +#include <gxio/iorpc_trio.h> +#include <hv/drv_trio_intf.h> + +#include <arch/sim.h> + +/* + * This file containes the routines to search for PCI buses, + * enumerate the buses, and configure any attached devices. + */ + +#define DEBUG_PCI_CFG 0 + +#if DEBUG_PCI_CFG +#define TRACE_CFG_WR(size, val, bus, dev, func, offset) \ + pr_info("CFG WR %d-byte VAL %#x to bus %d dev %d func %d addr %u\n", \ + size, val, bus, dev, func, offset & 0xFFF); +#define TRACE_CFG_RD(size, val, bus, dev, func, offset) \ + pr_info("CFG RD %d-byte VAL %#x from bus %d dev %d func %d addr %u\n", \ + size, val, bus, dev, func, offset & 0xFFF); +#else +#define TRACE_CFG_WR(...) +#define TRACE_CFG_RD(...) +#endif + +static int pci_probe = 1; + +/* Information on the PCIe RC ports configuration. */ +static int pcie_rc[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES]; + +/* + * On some platforms with one or more Gx endpoint ports, we need to + * delay the PCIe RC port probe for a few seconds to work around + * a HW PCIe link-training bug. The exact delay is specified with + * a kernel boot argument in the form of "pcie_rc_delay=T,P,S", + * where T is the TRIO instance number, P is the port number and S is + * the delay in seconds. If the argument is specified, but the delay is + * not provided, the value will be DEFAULT_RC_DELAY. + */ +static int rc_delay[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES]; + +/* Default number of seconds that the PCIe RC port probe can be delayed. */ +#define DEFAULT_RC_DELAY 10 + +/* The PCI I/O space size in each PCI domain. */ +#define IO_SPACE_SIZE 0x10000 + +/* Provide shorter versions of some very long constant names. */ +#define AUTO_CONFIG_RC \ + TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC +#define AUTO_CONFIG_RC_G1 \ + TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC_G1 +#define AUTO_CONFIG_EP \ + TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT +#define AUTO_CONFIG_EP_G1 \ + TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT_G1 + +/* Array of the PCIe ports configuration info obtained from the BIB. */ +struct pcie_trio_ports_property pcie_ports[TILEGX_NUM_TRIO]; + +/* Number of configured TRIO instances. */ +int num_trio_shims; + +/* All drivers share the TRIO contexts defined here. */ +gxio_trio_context_t trio_contexts[TILEGX_NUM_TRIO]; + +/* Pointer to an array of PCIe RC controllers. */ +struct pci_controller pci_controllers[TILEGX_NUM_TRIO * TILEGX_TRIO_PCIES]; +int num_rc_controllers; + +static struct pci_ops tile_cfg_ops; + +/* Mask of CPUs that should receive PCIe interrupts. */ +static struct cpumask intr_cpus_map; + +/* We don't need to worry about the alignment of resources. */ +resource_size_t pcibios_align_resource(void *data, const struct resource *res, + resource_size_t size, + resource_size_t align) +{ + return res->start; +} +EXPORT_SYMBOL(pcibios_align_resource); + +/* + * Pick a CPU to receive and handle the PCIe interrupts, based on the IRQ #. + * For now, we simply send interrupts to non-dataplane CPUs. + * We may implement methods to allow user to specify the target CPUs, + * e.g. via boot arguments. + */ +static int tile_irq_cpu(int irq) +{ + unsigned int count; + int i = 0; + int cpu; + + count = cpumask_weight(&intr_cpus_map); + if (unlikely(count == 0)) { + pr_warning("intr_cpus_map empty, interrupts will be" + " delievered to dataplane tiles\n"); + return irq % (smp_height * smp_width); + } + + count = irq % count; + for_each_cpu(cpu, &intr_cpus_map) { + if (i++ == count) + break; + } + return cpu; +} + +/* Open a file descriptor to the TRIO shim. */ +static int tile_pcie_open(int trio_index) +{ + gxio_trio_context_t *context = &trio_contexts[trio_index]; + int ret; + int mac; + + /* This opens a file descriptor to the TRIO shim. */ + ret = gxio_trio_init(context, trio_index); + if (ret < 0) + goto gxio_trio_init_failure; + + /* Allocate an ASID for the kernel. */ + ret = gxio_trio_alloc_asids(context, 1, 0, 0); + if (ret < 0) { + pr_err("PCI: ASID alloc failure on TRIO %d, give up\n", + trio_index); + goto asid_alloc_failure; + } + + context->asid = ret; + +#ifdef USE_SHARED_PCIE_CONFIG_REGION + /* + * Alloc a PIO region for config access, shared by all MACs per TRIO. + * This shouldn't fail since the kernel is supposed to the first + * client of the TRIO's PIO regions. + */ + ret = gxio_trio_alloc_pio_regions(context, 1, 0, 0); + if (ret < 0) { + pr_err("PCI: CFG PIO alloc failure on TRIO %d, give up\n", + trio_index); + goto pio_alloc_failure; + } + + context->pio_cfg_index = ret; + + /* + * For PIO CFG, the bus_address_hi parameter is 0. The mac parameter + * is also 0 because it is specified in PIO_REGION_SETUP_CFG_ADDR. + */ + ret = gxio_trio_init_pio_region_aux(context, context->pio_cfg_index, + 0, 0, HV_TRIO_PIO_FLAG_CONFIG_SPACE); + if (ret < 0) { + pr_err("PCI: CFG PIO init failure on TRIO %d, give up\n", + trio_index); + goto pio_alloc_failure; + } +#endif + + /* Get the properties of the PCIe ports on this TRIO instance. */ + ret = gxio_trio_get_port_property(context, &pcie_ports[trio_index]); + if (ret < 0) { + pr_err("PCI: PCIE_GET_PORT_PROPERTY failure, error %d," + " on TRIO %d\n", ret, trio_index); + goto get_port_property_failure; + } + + context->mmio_base_mac = + iorpc_ioremap(context->fd, 0, HV_TRIO_CONFIG_IOREMAP_SIZE); + if (context->mmio_base_mac == NULL) { + pr_err("PCI: TRIO config space mapping failure, error %d," + " on TRIO %d\n", ret, trio_index); + ret = -ENOMEM; + + goto trio_mmio_mapping_failure; + } + + /* Check the port strap state which will override the BIB setting. */ + for (mac = 0; mac < TILEGX_TRIO_PCIES; mac++) { + TRIO_PCIE_INTFC_PORT_CONFIG_t port_config; + unsigned int reg_offset; + + /* Ignore ports that are not specified in the BIB. */ + if (!pcie_ports[trio_index].ports[mac].allow_rc && + !pcie_ports[trio_index].ports[mac].allow_ep) + continue; + + reg_offset = + (TRIO_PCIE_INTFC_PORT_CONFIG << + TRIO_CFG_REGION_ADDR__REG_SHIFT) | + (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE << + TRIO_CFG_REGION_ADDR__INTFC_SHIFT) | + (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT); + + port_config.word = + __gxio_mmio_read(context->mmio_base_mac + reg_offset); + + if (port_config.strap_state != AUTO_CONFIG_RC && + port_config.strap_state != AUTO_CONFIG_RC_G1) { + /* + * If this is really intended to be an EP port, record + * it so that the endpoint driver will know about it. + */ + if (port_config.strap_state == AUTO_CONFIG_EP || + port_config.strap_state == AUTO_CONFIG_EP_G1) + pcie_ports[trio_index].ports[mac].allow_ep = 1; + } + } + + return ret; + +trio_mmio_mapping_failure: +get_port_property_failure: +asid_alloc_failure: +#ifdef USE_SHARED_PCIE_CONFIG_REGION +pio_alloc_failure: +#endif + hv_dev_close(context->fd); +gxio_trio_init_failure: + context->fd = -1; + + return ret; +} + +static int __init tile_trio_init(void) +{ + int i; + + /* We loop over all the TRIO shims. */ + for (i = 0; i < TILEGX_NUM_TRIO; i++) { + if (tile_pcie_open(i) < 0) + continue; + num_trio_shims++; + } + + return 0; +} +postcore_initcall(tile_trio_init); + +static void tilegx_legacy_irq_ack(struct irq_data *d) +{ + __insn_mtspr(SPR_IPI_EVENT_RESET_K, 1UL << d->irq); +} + +static void tilegx_legacy_irq_mask(struct irq_data *d) +{ + __insn_mtspr(SPR_IPI_MASK_SET_K, 1UL << d->irq); +} + +static void tilegx_legacy_irq_unmask(struct irq_data *d) +{ + __insn_mtspr(SPR_IPI_MASK_RESET_K, 1UL << d->irq); +} + +static struct irq_chip tilegx_legacy_irq_chip = { + .name = "tilegx_legacy_irq", + .irq_ack = tilegx_legacy_irq_ack, + .irq_mask = tilegx_legacy_irq_mask, + .irq_unmask = tilegx_legacy_irq_unmask, + + /* TBD: support set_affinity. */ +}; + +/* + * This is a wrapper function of the kernel level-trigger interrupt + * handler handle_level_irq() for PCI legacy interrupts. The TRIO + * is configured such that only INTx Assert interrupts are proxied + * to Linux which just calls handle_level_irq() after clearing the + * MAC INTx Assert status bit associated with this interrupt. + */ +static void trio_handle_level_irq(unsigned int irq, struct irq_desc *desc) +{ + struct pci_controller *controller = irq_desc_get_handler_data(desc); + gxio_trio_context_t *trio_context = controller->trio; + uint64_t intx = (uint64_t)irq_desc_get_chip_data(desc); + int mac = controller->mac; + unsigned int reg_offset; + uint64_t level_mask; + + handle_level_irq(irq, desc); + + /* + * Clear the INTx Level status, otherwise future interrupts are + * not sent. + */ + reg_offset = (TRIO_PCIE_INTFC_MAC_INT_STS << + TRIO_CFG_REGION_ADDR__REG_SHIFT) | + (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE << + TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) | + (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT); + + level_mask = TRIO_PCIE_INTFC_MAC_INT_STS__INT_LEVEL_MASK << intx; + + __gxio_mmio_write(trio_context->mmio_base_mac + reg_offset, level_mask); +} + +/* + * Create kernel irqs and set up the handlers for the legacy interrupts. + * Also some minimum initialization for the MSI support. + */ +static int tile_init_irqs(struct pci_controller *controller) +{ + int i; + int j; + int irq; + int result; + + cpumask_copy(&intr_cpus_map, cpu_online_mask); + + + for (i = 0; i < 4; i++) { + gxio_trio_context_t *context = controller->trio; + int cpu; + + /* Ask the kernel to allocate an IRQ. */ + irq = irq_alloc_hwirq(-1); + if (!irq) { + pr_err("PCI: no free irq vectors, failed for %d\n", i); + goto free_irqs; + } + controller->irq_intx_table[i] = irq; + + /* Distribute the 4 IRQs to different tiles. */ + cpu = tile_irq_cpu(irq); + + /* Configure the TRIO intr binding for this IRQ. */ + result = gxio_trio_config_legacy_intr(context, cpu_x(cpu), + cpu_y(cpu), KERNEL_PL, + irq, controller->mac, i); + if (result < 0) { + pr_err("PCI: MAC intx config failed for %d\n", i); + + goto free_irqs; + } + + /* Register the IRQ handler with the kernel. */ + irq_set_chip_and_handler(irq, &tilegx_legacy_irq_chip, + trio_handle_level_irq); + irq_set_chip_data(irq, (void *)(uint64_t)i); + irq_set_handler_data(irq, controller); + } + + return 0; + +free_irqs: + for (j = 0; j < i; j++) + irq_free_hwirq(controller->irq_intx_table[j]); + + return -1; +} + +/* + * Return 1 if the port is strapped to operate in RC mode. + */ +static int +strapped_for_rc(gxio_trio_context_t *trio_context, int mac) +{ + TRIO_PCIE_INTFC_PORT_CONFIG_t port_config; + unsigned int reg_offset; + + /* Check the port configuration. */ + reg_offset = + (TRIO_PCIE_INTFC_PORT_CONFIG << + TRIO_CFG_REGION_ADDR__REG_SHIFT) | + (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE << + TRIO_CFG_REGION_ADDR__INTFC_SHIFT) | + (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT); + port_config.word = + __gxio_mmio_read(trio_context->mmio_base_mac + reg_offset); + + if (port_config.strap_state == AUTO_CONFIG_RC || + port_config.strap_state == AUTO_CONFIG_RC_G1) + return 1; + else + return 0; +} + +/* + * Find valid controllers and fill in pci_controller structs for each + * of them. + * + * Return the number of controllers discovered. + */ +int __init tile_pci_init(void) +{ + int ctl_index = 0; + int i, j; + + if (!pci_probe) { + pr_info("PCI: disabled by boot argument\n"); + return 0; + } + + pr_info("PCI: Searching for controllers...\n"); + + if (num_trio_shims == 0 || sim_is_simulator()) + return 0; + + /* + * Now determine which PCIe ports are configured to operate in RC + * mode. There is a differece in the port configuration capability + * between the Gx36 and Gx72 devices. + * + * The Gx36 has configuration capability for each of the 3 PCIe + * interfaces (disable, auto endpoint, auto RC, etc.). + * On the Gx72, you can only select one of the 3 PCIe interfaces per + * TRIO to train automatically. Further, the allowable training modes + * are reduced to four options (auto endpoint, auto RC, stream x1, + * stream x4). + * + * For Gx36 ports, it must be allowed to be in RC mode by the + * Board Information Block, and the hardware strapping pins must be + * set to RC mode. + * + * For Gx72 ports, the port will operate in RC mode if either of the + * following is true: + * 1. It is allowed to be in RC mode by the Board Information Block, + * and the BIB doesn't allow the EP mode. + * 2. It is allowed to be in either the RC or the EP mode by the BIB, + * and the hardware strapping pin is set to RC mode. + */ + for (i = 0; i < TILEGX_NUM_TRIO; i++) { + gxio_trio_context_t *context = &trio_contexts[i]; + + if (context->fd < 0) + continue; + + for (j = 0; j < TILEGX_TRIO_PCIES; j++) { + int is_rc = 0; + + if (pcie_ports[i].is_gx72 && + pcie_ports[i].ports[j].allow_rc) { + if (!pcie_ports[i].ports[j].allow_ep || + strapped_for_rc(context, j)) + is_rc = 1; + } else if (pcie_ports[i].ports[j].allow_rc && + strapped_for_rc(context, j)) { + is_rc = 1; + } + if (is_rc) { + pcie_rc[i][j] = 1; + num_rc_controllers++; + } + } + } + + /* Return if no PCIe ports are configured to operate in RC mode. */ + if (num_rc_controllers == 0) + return 0; + + /* Set the TRIO pointer and MAC index for each PCIe RC port. */ + for (i = 0; i < TILEGX_NUM_TRIO; i++) { + for (j = 0; j < TILEGX_TRIO_PCIES; j++) { + if (pcie_rc[i][j]) { + pci_controllers[ctl_index].trio = + &trio_contexts[i]; + pci_controllers[ctl_index].mac = j; + pci_controllers[ctl_index].trio_index = i; + ctl_index++; + if (ctl_index == num_rc_controllers) + goto out; + } + } + } + +out: + /* Configure each PCIe RC port. */ + for (i = 0; i < num_rc_controllers; i++) { + + /* Configure the PCIe MAC to run in RC mode. */ + struct pci_controller *controller = &pci_controllers[i]; + + controller->index = i; + controller->ops = &tile_cfg_ops; + + controller->io_space.start = PCIBIOS_MIN_IO + + (i * IO_SPACE_SIZE); + controller->io_space.end = controller->io_space.start + + IO_SPACE_SIZE - 1; + BUG_ON(controller->io_space.end > IO_SPACE_LIMIT); + controller->io_space.flags = IORESOURCE_IO; + snprintf(controller->io_space_name, + sizeof(controller->io_space_name), + "PCI I/O domain %d", i); + controller->io_space.name = controller->io_space_name; + + /* + * The PCI memory resource is located above the PA space. + * For every host bridge, the BAR window or the MMIO aperture + * is in range [3GB, 4GB - 1] of a 4GB space beyond the + * PA space. + */ + controller->mem_offset = TILE_PCI_MEM_START + + (i * TILE_PCI_BAR_WINDOW_TOP); + controller->mem_space.start = controller->mem_offset + + TILE_PCI_BAR_WINDOW_TOP - TILE_PCI_BAR_WINDOW_SIZE; + controller->mem_space.end = controller->mem_offset + + TILE_PCI_BAR_WINDOW_TOP - 1; + controller->mem_space.flags = IORESOURCE_MEM; + snprintf(controller->mem_space_name, + sizeof(controller->mem_space_name), + "PCI mem domain %d", i); + controller->mem_space.name = controller->mem_space_name; + } + + return num_rc_controllers; +} + +/* + * (pin - 1) converts from the PCI standard's [1:4] convention to + * a normal [0:3] range. + */ +static int tile_map_irq(const struct pci_dev *dev, u8 device, u8 pin) +{ + struct pci_controller *controller = + (struct pci_controller *)dev->sysdata; + return controller->irq_intx_table[pin - 1]; +} + +static void fixup_read_and_payload_sizes(struct pci_controller *controller) +{ + gxio_trio_context_t *trio_context = controller->trio; + struct pci_bus *root_bus = controller->root_bus; + TRIO_PCIE_RC_DEVICE_CONTROL_t dev_control; + TRIO_PCIE_RC_DEVICE_CAP_t rc_dev_cap; + unsigned int reg_offset; + struct pci_bus *child; + int mac; + int err; + + mac = controller->mac; + + /* Set our max read request size to be 4KB. */ + reg_offset = + (TRIO_PCIE_RC_DEVICE_CONTROL << + TRIO_CFG_REGION_ADDR__REG_SHIFT) | + (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_STANDARD << + TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) | + (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT); + + dev_control.word = __gxio_mmio_read32(trio_context->mmio_base_mac + + reg_offset); + dev_control.max_read_req_sz = 5; + __gxio_mmio_write32(trio_context->mmio_base_mac + reg_offset, + dev_control.word); + + /* + * Set the max payload size supported by this Gx PCIe MAC. + * Though Gx PCIe supports Max Payload Size of up to 1024 bytes, + * experiments have shown that setting MPS to 256 yields the + * best performance. + */ + reg_offset = + (TRIO_PCIE_RC_DEVICE_CAP << + TRIO_CFG_REGION_ADDR__REG_SHIFT) | + (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_STANDARD << + TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) | + (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT); + + rc_dev_cap.word = __gxio_mmio_read32(trio_context->mmio_base_mac + + reg_offset); + rc_dev_cap.mps_sup = 1; + __gxio_mmio_write32(trio_context->mmio_base_mac + reg_offset, + rc_dev_cap.word); + + /* Configure PCI Express MPS setting. */ + list_for_each_entry(child, &root_bus->children, node) + pcie_bus_configure_settings(child); + + /* + * Set the mac_config register in trio based on the MPS/MRS of the link. + */ + reg_offset = + (TRIO_PCIE_RC_DEVICE_CONTROL << + TRIO_CFG_REGION_ADDR__REG_SHIFT) | + (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_STANDARD << + TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) | + (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT); + + dev_control.word = __gxio_mmio_read32(trio_context->mmio_base_mac + + reg_offset); + + err = gxio_trio_set_mps_mrs(trio_context, + dev_control.max_payload_size, + dev_control.max_read_req_sz, + mac); + if (err < 0) { + pr_err("PCI: PCIE_CONFIGURE_MAC_MPS_MRS failure, " + "MAC %d on TRIO %d\n", + mac, controller->trio_index); + } +} + +static int setup_pcie_rc_delay(char *str) +{ + unsigned long delay = 0; + unsigned long trio_index; + unsigned long mac; + + if (str == NULL || !isdigit(*str)) + return -EINVAL; + trio_index = simple_strtoul(str, (char **)&str, 10); + if (trio_index >= TILEGX_NUM_TRIO) + return -EINVAL; + + if (*str != ',') + return -EINVAL; + + str++; + if (!isdigit(*str)) + return -EINVAL; + mac = simple_strtoul(str, (char **)&str, 10); + if (mac >= TILEGX_TRIO_PCIES) + return -EINVAL; + + if (*str != '\0') { + if (*str != ',') + return -EINVAL; + + str++; + if (!isdigit(*str)) + return -EINVAL; + delay = simple_strtoul(str, (char **)&str, 10); + } + + rc_delay[trio_index][mac] = delay ? : DEFAULT_RC_DELAY; + return 0; +} +early_param("pcie_rc_delay", setup_pcie_rc_delay); + +/* PCI initialization entry point, called by subsys_initcall. */ +int __init pcibios_init(void) +{ + resource_size_t offset; + LIST_HEAD(resources); + int next_busno; + int i; + + tile_pci_init(); + + if (num_rc_controllers == 0) + return 0; + + /* + * Delay a bit in case devices aren't ready. Some devices are + * known to require at least 20ms here, but we use a more + * conservative value. + */ + msleep(250); + + /* Scan all of the recorded PCI controllers. */ + for (next_busno = 0, i = 0; i < num_rc_controllers; i++) { + struct pci_controller *controller = &pci_controllers[i]; + gxio_trio_context_t *trio_context = controller->trio; + TRIO_PCIE_INTFC_PORT_STATUS_t port_status; + TRIO_PCIE_INTFC_TX_FIFO_CTL_t tx_fifo_ctl; + struct pci_bus *bus; + unsigned int reg_offset; + unsigned int class_code_revision; + int trio_index; + int mac; + int ret; + + if (trio_context->fd < 0) + continue; + + trio_index = controller->trio_index; + mac = controller->mac; + + /* + * Check for PCIe link-up status to decide if we need + * to force the link to come up. + */ + reg_offset = + (TRIO_PCIE_INTFC_PORT_STATUS << + TRIO_CFG_REGION_ADDR__REG_SHIFT) | + (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE << + TRIO_CFG_REGION_ADDR__INTFC_SHIFT) | + (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT); + + port_status.word = + __gxio_mmio_read(trio_context->mmio_base_mac + + reg_offset); + if (!port_status.dl_up) { + if (rc_delay[trio_index][mac]) { + pr_info("Delaying PCIe RC TRIO init %d sec" + " on MAC %d on TRIO %d\n", + rc_delay[trio_index][mac], mac, + trio_index); + msleep(rc_delay[trio_index][mac] * 1000); + } + ret = gxio_trio_force_rc_link_up(trio_context, mac); + if (ret < 0) + pr_err("PCI: PCIE_FORCE_LINK_UP failure, " + "MAC %d on TRIO %d\n", mac, trio_index); + } + + pr_info("PCI: Found PCI controller #%d on TRIO %d MAC %d\n", i, + trio_index, controller->mac); + + /* Delay the bus probe if needed. */ + if (rc_delay[trio_index][mac]) { + pr_info("Delaying PCIe RC bus enumerating %d sec" + " on MAC %d on TRIO %d\n", + rc_delay[trio_index][mac], mac, + trio_index); + msleep(rc_delay[trio_index][mac] * 1000); + } else { + /* + * Wait a bit here because some EP devices + * take longer to come up. + */ + msleep(1000); + } + + /* Check for PCIe link-up status again. */ + port_status.word = + __gxio_mmio_read(trio_context->mmio_base_mac + + reg_offset); + if (!port_status.dl_up) { + if (pcie_ports[trio_index].ports[mac].removable) { + pr_info("PCI: link is down, MAC %d on TRIO %d\n", + mac, trio_index); + pr_info("This is expected if no PCIe card" + " is connected to this link\n"); + } else + pr_err("PCI: link is down, MAC %d on TRIO %d\n", + mac, trio_index); + continue; + } + + /* + * Ensure that the link can come out of L1 power down state. + * Strictly speaking, this is needed only in the case of + * heavy RC-initiated DMAs. + */ + reg_offset = + (TRIO_PCIE_INTFC_TX_FIFO_CTL << + TRIO_CFG_REGION_ADDR__REG_SHIFT) | + (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE << + TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) | + (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT); + tx_fifo_ctl.word = + __gxio_mmio_read(trio_context->mmio_base_mac + + reg_offset); + tx_fifo_ctl.min_p_credits = 0; + __gxio_mmio_write(trio_context->mmio_base_mac + reg_offset, + tx_fifo_ctl.word); + + /* + * Change the device ID so that Linux bus crawl doesn't confuse + * the internal bridge with any Tilera endpoints. + */ + reg_offset = + (TRIO_PCIE_RC_DEVICE_ID_VEN_ID << + TRIO_CFG_REGION_ADDR__REG_SHIFT) | + (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_STANDARD << + TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) | + (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT); + + __gxio_mmio_write32(trio_context->mmio_base_mac + reg_offset, + (TILERA_GX36_RC_DEV_ID << + TRIO_PCIE_RC_DEVICE_ID_VEN_ID__DEV_ID_SHIFT) | + TILERA_VENDOR_ID); + + /* Set the internal P2P bridge class code. */ + reg_offset = + (TRIO_PCIE_RC_REVISION_ID << + TRIO_CFG_REGION_ADDR__REG_SHIFT) | + (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_STANDARD << + TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) | + (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT); + + class_code_revision = + __gxio_mmio_read32(trio_context->mmio_base_mac + + reg_offset); + class_code_revision = (class_code_revision & 0xff) | + (PCI_CLASS_BRIDGE_PCI << 16); + + __gxio_mmio_write32(trio_context->mmio_base_mac + + reg_offset, class_code_revision); + +#ifdef USE_SHARED_PCIE_CONFIG_REGION + + /* Map in the MMIO space for the PIO region. */ + offset = HV_TRIO_PIO_OFFSET(trio_context->pio_cfg_index) | + (((unsigned long long)mac) << + TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT); + +#else + + /* Alloc a PIO region for PCI config access per MAC. */ + ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0); + if (ret < 0) { + pr_err("PCI: PCI CFG PIO alloc failure for mac %d " + "on TRIO %d, give up\n", mac, trio_index); + + continue; + } + + trio_context->pio_cfg_index[mac] = ret; + + /* For PIO CFG, the bus_address_hi parameter is 0. */ + ret = gxio_trio_init_pio_region_aux(trio_context, + trio_context->pio_cfg_index[mac], + mac, 0, HV_TRIO_PIO_FLAG_CONFIG_SPACE); + if (ret < 0) { + pr_err("PCI: PCI CFG PIO init failure for mac %d " + "on TRIO %d, give up\n", mac, trio_index); + + continue; + } + + offset = HV_TRIO_PIO_OFFSET(trio_context->pio_cfg_index[mac]) | + (((unsigned long long)mac) << + TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT); + +#endif + + /* + * To save VMALLOC space, we take advantage of the fact that + * bit 29 in the PIO CFG address format is reserved 0. With + * TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT being 30, + * this cuts VMALLOC space usage from 1GB to 512MB per mac. + */ + trio_context->mmio_base_pio_cfg[mac] = + iorpc_ioremap(trio_context->fd, offset, (1UL << + (TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT - 1))); + if (trio_context->mmio_base_pio_cfg[mac] == NULL) { + pr_err("PCI: PIO map failure for mac %d on TRIO %d\n", + mac, trio_index); + + continue; + } + + /* Initialize the PCIe interrupts. */ + if (tile_init_irqs(controller)) { + pr_err("PCI: IRQs init failure for mac %d on TRIO %d\n", + mac, trio_index); + + continue; + } + + /* + * The PCI memory resource is located above the PA space. + * The memory range for the PCI root bus should not overlap + * with the physical RAM. + */ + pci_add_resource_offset(&resources, &controller->mem_space, + controller->mem_offset); + pci_add_resource(&resources, &controller->io_space); + controller->first_busno = next_busno; + bus = pci_scan_root_bus(NULL, next_busno, controller->ops, + controller, &resources); + controller->root_bus = bus; + next_busno = bus->busn_res.end + 1; + } + + /* Do machine dependent PCI interrupt routing */ + pci_fixup_irqs(pci_common_swizzle, tile_map_irq); + + /* + * This comes from the generic Linux PCI driver. + * + * It allocates all of the resources (I/O memory, etc) + * associated with the devices read in above. + */ + pci_assign_unassigned_resources(); + + /* Record the I/O resources in the PCI controller structure. */ + for (i = 0; i < num_rc_controllers; i++) { + struct pci_controller *controller = &pci_controllers[i]; + gxio_trio_context_t *trio_context = controller->trio; + struct pci_bus *root_bus = pci_controllers[i].root_bus; + int ret; + int j; + + /* + * Skip controllers that are not properly initialized or + * have down links. + */ + if (root_bus == NULL) + continue; + + /* Configure the max_payload_size values for this domain. */ + fixup_read_and_payload_sizes(controller); + + /* Alloc a PIO region for PCI memory access for each RC port. */ + ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0); + if (ret < 0) { + pr_err("PCI: MEM PIO alloc failure on TRIO %d mac %d, " + "give up\n", controller->trio_index, + controller->mac); + + continue; + } + + controller->pio_mem_index = ret; + + /* + * For PIO MEM, the bus_address_hi parameter is hard-coded 0 + * because we always assign 32-bit PCI bus BAR ranges. + */ + ret = gxio_trio_init_pio_region_aux(trio_context, + controller->pio_mem_index, + controller->mac, + 0, + 0); + if (ret < 0) { + pr_err("PCI: MEM PIO init failure on TRIO %d mac %d, " + "give up\n", controller->trio_index, + controller->mac); + + continue; + } + +#ifdef CONFIG_TILE_PCI_IO + /* + * Alloc a PIO region for PCI I/O space access for each RC port. + */ + ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0); + if (ret < 0) { + pr_err("PCI: I/O PIO alloc failure on TRIO %d mac %d, " + "give up\n", controller->trio_index, + controller->mac); + + continue; + } + + controller->pio_io_index = ret; + + /* + * For PIO IO, the bus_address_hi parameter is hard-coded 0 + * because PCI I/O address space is 32-bit. + */ + ret = gxio_trio_init_pio_region_aux(trio_context, + controller->pio_io_index, + controller->mac, + 0, + HV_TRIO_PIO_FLAG_IO_SPACE); + if (ret < 0) { + pr_err("PCI: I/O PIO init failure on TRIO %d mac %d, " + "give up\n", controller->trio_index, + controller->mac); + + continue; + } +#endif + + /* + * Configure a Mem-Map region for each memory controller so + * that Linux can map all of its PA space to the PCI bus. + * Use the IOMMU to handle hash-for-home memory. + */ + for_each_online_node(j) { + unsigned long start_pfn = node_start_pfn[j]; + unsigned long end_pfn = node_end_pfn[j]; + unsigned long nr_pages = end_pfn - start_pfn; + + ret = gxio_trio_alloc_memory_maps(trio_context, 1, 0, + 0); + if (ret < 0) { + pr_err("PCI: Mem-Map alloc failure on TRIO %d " + "mac %d for MC %d, give up\n", + controller->trio_index, + controller->mac, j); + + goto alloc_mem_map_failed; + } + + controller->mem_maps[j] = ret; + + /* + * Initialize the Mem-Map and the I/O MMU so that all + * the physical memory can be accessed by the endpoint + * devices. The base bus address is set to the base CPA + * of this memory controller plus an offset (see pci.h). + * The region's base VA is set to the base CPA. The + * I/O MMU table essentially translates the CPA to + * the real PA. Implicitly, for node 0, we create + * a separate Mem-Map region that serves as the inbound + * window for legacy 32-bit devices. This is a direct + * map of the low 4GB CPA space. + */ + ret = gxio_trio_init_memory_map_mmu_aux(trio_context, + controller->mem_maps[j], + start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT, + trio_context->asid, + controller->mac, + (start_pfn << PAGE_SHIFT) + + TILE_PCI_MEM_MAP_BASE_OFFSET, + j, + GXIO_TRIO_ORDER_MODE_UNORDERED); + if (ret < 0) { + pr_err("PCI: Mem-Map init failure on TRIO %d " + "mac %d for MC %d, give up\n", + controller->trio_index, + controller->mac, j); + + goto alloc_mem_map_failed; + } + continue; + +alloc_mem_map_failed: + break; + } + } + + return 0; +} +subsys_initcall(pcibios_init); + +/* No bus fixups needed. */ +void pcibios_fixup_bus(struct pci_bus *bus) +{ +} + +/* Process any "pci=" kernel boot arguments. */ +char *__init pcibios_setup(char *str) +{ + if (!strcmp(str, "off")) { + pci_probe = 0; + return NULL; + } + return str; +} + +/* + * Called for each device after PCI setup is done. + * We initialize the PCI device capabilities conservatively, assuming that + * all devices can only address the 32-bit DMA space. The exception here is + * that the device dma_offset is set to the value that matches the 64-bit + * capable devices. This is OK because dma_offset is not used by legacy + * dma_ops, nor by the hybrid dma_ops's streaming DMAs, which are 64-bit ops. + * This implementation matches the kernel design of setting PCI devices' + * coherent_dma_mask to 0xffffffffull by default, allowing the device drivers + * to skip calling pci_set_consistent_dma_mask(DMA_BIT_MASK(32)). + */ +static void pcibios_fixup_final(struct pci_dev *pdev) +{ + set_dma_ops(&pdev->dev, gx_legacy_pci_dma_map_ops); + set_dma_offset(&pdev->dev, TILE_PCI_MEM_MAP_BASE_OFFSET); + pdev->dev.archdata.max_direct_dma_addr = + TILE_PCI_MAX_DIRECT_DMA_ADDRESS; + pdev->dev.coherent_dma_mask = TILE_PCI_MAX_DIRECT_DMA_ADDRESS; +} +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pcibios_fixup_final); + +/* Map a PCI MMIO bus address into VA space. */ +void __iomem *ioremap(resource_size_t phys_addr, unsigned long size) +{ + struct pci_controller *controller = NULL; + resource_size_t bar_start; + resource_size_t bar_end; + resource_size_t offset; + resource_size_t start; + resource_size_t end; + int trio_fd; + int i; + + start = phys_addr; + end = phys_addr + size - 1; + + /* + * By searching phys_addr in each controller's mem_space, we can + * determine the controller that should accept the PCI memory access. + */ + for (i = 0; i < num_rc_controllers; i++) { + /* + * Skip controllers that are not properly initialized or + * have down links. + */ + if (pci_controllers[i].root_bus == NULL) + continue; + + bar_start = pci_controllers[i].mem_space.start; + bar_end = pci_controllers[i].mem_space.end; + + if ((start >= bar_start) && (end <= bar_end)) { + controller = &pci_controllers[i]; + break; + } + } + + if (controller == NULL) + return NULL; + + trio_fd = controller->trio->fd; + + /* Convert the resource start to the bus address offset. */ + start = phys_addr - controller->mem_offset; + + offset = HV_TRIO_PIO_OFFSET(controller->pio_mem_index) + start; + + /* We need to keep the PCI bus address's in-page offset in the VA. */ + return iorpc_ioremap(trio_fd, offset, size) + + (start & (PAGE_SIZE - 1)); +} +EXPORT_SYMBOL(ioremap); + +#ifdef CONFIG_TILE_PCI_IO +/* Map a PCI I/O address into VA space. */ +void __iomem *ioport_map(unsigned long port, unsigned int size) +{ + struct pci_controller *controller = NULL; + resource_size_t bar_start; + resource_size_t bar_end; + resource_size_t offset; + resource_size_t start; + resource_size_t end; + int trio_fd; + int i; + + start = port; + end = port + size - 1; + + /* + * By searching the port in each controller's io_space, we can + * determine the controller that should accept the PCI I/O access. + */ + for (i = 0; i < num_rc_controllers; i++) { + /* + * Skip controllers that are not properly initialized or + * have down links. + */ + if (pci_controllers[i].root_bus == NULL) + continue; + + bar_start = pci_controllers[i].io_space.start; + bar_end = pci_controllers[i].io_space.end; + + if ((start >= bar_start) && (end <= bar_end)) { + controller = &pci_controllers[i]; + break; + } + } + + if (controller == NULL) + return NULL; + + trio_fd = controller->trio->fd; + + /* Convert the resource start to the bus address offset. */ + port -= controller->io_space.start; + + offset = HV_TRIO_PIO_OFFSET(controller->pio_io_index) + port; + + /* We need to keep the PCI bus address's in-page offset in the VA. */ + return iorpc_ioremap(trio_fd, offset, size) + (port & (PAGE_SIZE - 1)); +} +EXPORT_SYMBOL(ioport_map); + +void ioport_unmap(void __iomem *addr) +{ + iounmap(addr); +} +EXPORT_SYMBOL(ioport_unmap); +#endif + +void pci_iounmap(struct pci_dev *dev, void __iomem *addr) +{ + iounmap(addr); +} +EXPORT_SYMBOL(pci_iounmap); + +/**************************************************************** + * + * Tile PCI config space read/write routines + * + ****************************************************************/ + +/* + * These are the normal read and write ops + * These are expanded with macros from pci_bus_read_config_byte() etc. + * + * devfn is the combined PCI device & function. + * + * offset is in bytes, from the start of config space for the + * specified bus & device. + */ +static int tile_cfg_read(struct pci_bus *bus, unsigned int devfn, int offset, + int size, u32 *val) +{ + struct pci_controller *controller = bus->sysdata; + gxio_trio_context_t *trio_context = controller->trio; + int busnum = bus->number & 0xff; + int device = PCI_SLOT(devfn); + int function = PCI_FUNC(devfn); + int config_type = 1; + TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR_t cfg_addr; + void *mmio_addr; + + /* + * Map all accesses to the local device on root bus into the + * MMIO space of the MAC. Accesses to the downstream devices + * go to the PIO space. + */ + if (pci_is_root_bus(bus)) { + if (device == 0) { + /* + * This is the internal downstream P2P bridge, + * access directly. + */ + unsigned int reg_offset; + + reg_offset = ((offset & 0xFFF) << + TRIO_CFG_REGION_ADDR__REG_SHIFT) | + (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_PROTECTED + << TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) | + (controller->mac << + TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT); + + mmio_addr = trio_context->mmio_base_mac + reg_offset; + + goto valid_device; + + } else { + /* + * We fake an empty device for (device > 0), + * since there is only one device on bus 0. + */ + goto invalid_device; + } + } + + /* + * Accesses to the directly attached device have to be + * sent as type-0 configs. + */ + if (busnum == (controller->first_busno + 1)) { + /* + * There is only one device off of our built-in P2P bridge. + */ + if (device != 0) + goto invalid_device; + + config_type = 0; + } + + cfg_addr.word = 0; + cfg_addr.reg_addr = (offset & 0xFFF); + cfg_addr.fn = function; + cfg_addr.dev = device; + cfg_addr.bus = busnum; + cfg_addr.type = config_type; + + /* + * Note that we don't set the mac field in cfg_addr because the + * mapping is per port. + */ + mmio_addr = trio_context->mmio_base_pio_cfg[controller->mac] + + cfg_addr.word; + +valid_device: + + switch (size) { + case 4: + *val = __gxio_mmio_read32(mmio_addr); + break; + + case 2: + *val = __gxio_mmio_read16(mmio_addr); + break; + + case 1: + *val = __gxio_mmio_read8(mmio_addr); + break; + + default: + return PCIBIOS_FUNC_NOT_SUPPORTED; + } + + TRACE_CFG_RD(size, *val, busnum, device, function, offset); + + return 0; + +invalid_device: + + switch (size) { + case 4: + *val = 0xFFFFFFFF; + break; + + case 2: + *val = 0xFFFF; + break; + + case 1: + *val = 0xFF; + break; + + default: + return PCIBIOS_FUNC_NOT_SUPPORTED; + } + + return 0; +} + + +/* + * See tile_cfg_read() for relevent comments. + * Note that "val" is the value to write, not a pointer to that value. + */ +static int tile_cfg_write(struct pci_bus *bus, unsigned int devfn, int offset, + int size, u32 val) +{ + struct pci_controller *controller = bus->sysdata; + gxio_trio_context_t *trio_context = controller->trio; + int busnum = bus->number & 0xff; + int device = PCI_SLOT(devfn); + int function = PCI_FUNC(devfn); + int config_type = 1; + TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR_t cfg_addr; + void *mmio_addr; + u32 val_32 = (u32)val; + u16 val_16 = (u16)val; + u8 val_8 = (u8)val; + + /* + * Map all accesses to the local device on root bus into the + * MMIO space of the MAC. Accesses to the downstream devices + * go to the PIO space. + */ + if (pci_is_root_bus(bus)) { + if (device == 0) { + /* + * This is the internal downstream P2P bridge, + * access directly. + */ + unsigned int reg_offset; + + reg_offset = ((offset & 0xFFF) << + TRIO_CFG_REGION_ADDR__REG_SHIFT) | + (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_PROTECTED + << TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) | + (controller->mac << + TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT); + + mmio_addr = trio_context->mmio_base_mac + reg_offset; + + goto valid_device; + + } else { + /* + * We fake an empty device for (device > 0), + * since there is only one device on bus 0. + */ + goto invalid_device; + } + } + + /* + * Accesses to the directly attached device have to be + * sent as type-0 configs. + */ + if (busnum == (controller->first_busno + 1)) { + /* + * There is only one device off of our built-in P2P bridge. + */ + if (device != 0) + goto invalid_device; + + config_type = 0; + } + + cfg_addr.word = 0; + cfg_addr.reg_addr = (offset & 0xFFF); + cfg_addr.fn = function; + cfg_addr.dev = device; + cfg_addr.bus = busnum; + cfg_addr.type = config_type; + + /* + * Note that we don't set the mac field in cfg_addr because the + * mapping is per port. + */ + mmio_addr = trio_context->mmio_base_pio_cfg[controller->mac] + + cfg_addr.word; + +valid_device: + + switch (size) { + case 4: + __gxio_mmio_write32(mmio_addr, val_32); + TRACE_CFG_WR(size, val_32, busnum, device, function, offset); + break; + + case 2: + __gxio_mmio_write16(mmio_addr, val_16); + TRACE_CFG_WR(size, val_16, busnum, device, function, offset); + break; + + case 1: + __gxio_mmio_write8(mmio_addr, val_8); + TRACE_CFG_WR(size, val_8, busnum, device, function, offset); + break; + + default: + return PCIBIOS_FUNC_NOT_SUPPORTED; + } + +invalid_device: + + return 0; +} + + +static struct pci_ops tile_cfg_ops = { + .read = tile_cfg_read, + .write = tile_cfg_write, +}; + + +/* MSI support starts here. */ +static unsigned int tilegx_msi_startup(struct irq_data *d) +{ + if (d->msi_desc) + unmask_msi_irq(d); + + return 0; +} + +static void tilegx_msi_ack(struct irq_data *d) +{ + __insn_mtspr(SPR_IPI_EVENT_RESET_K, 1UL << d->irq); +} + +static void tilegx_msi_mask(struct irq_data *d) +{ + mask_msi_irq(d); + __insn_mtspr(SPR_IPI_MASK_SET_K, 1UL << d->irq); +} + +static void tilegx_msi_unmask(struct irq_data *d) +{ + __insn_mtspr(SPR_IPI_MASK_RESET_K, 1UL << d->irq); + unmask_msi_irq(d); +} + +static struct irq_chip tilegx_msi_chip = { + .name = "tilegx_msi", + .irq_startup = tilegx_msi_startup, + .irq_ack = tilegx_msi_ack, + .irq_mask = tilegx_msi_mask, + .irq_unmask = tilegx_msi_unmask, + + /* TBD: support set_affinity. */ +}; + +int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc) +{ + struct pci_controller *controller; + gxio_trio_context_t *trio_context; + struct msi_msg msg; + int default_irq; + uint64_t mem_map_base; + uint64_t mem_map_limit; + u64 msi_addr; + int mem_map; + int cpu; + int irq; + int ret; + + irq = irq_alloc_hwirq(-1); + if (!irq) + return -ENOSPC; + + /* + * Since we use a 64-bit Mem-Map to accept the MSI write, we fail + * devices that are not capable of generating a 64-bit message address. + * These devices will fall back to using the legacy interrupts. + * Most PCIe endpoint devices do support 64-bit message addressing. + */ + if (desc->msi_attrib.is_64 == 0) { + dev_printk(KERN_INFO, &pdev->dev, + "64-bit MSI message address not supported, " + "falling back to legacy interrupts.\n"); + + ret = -ENOMEM; + goto is_64_failure; + } + + default_irq = desc->msi_attrib.default_irq; + controller = irq_get_handler_data(default_irq); + + BUG_ON(!controller); + + trio_context = controller->trio; + + /* + * Allocate a scatter-queue that will accept the MSI write and + * trigger the TILE-side interrupts. We use the scatter-queue regions + * before the mem map regions, because the latter are needed by more + * applications. + */ + mem_map = gxio_trio_alloc_scatter_queues(trio_context, 1, 0, 0); + if (mem_map >= 0) { + TRIO_MAP_SQ_DOORBELL_FMT_t doorbell_template = {{ + .pop = 0, + .doorbell = 1, + }}; + + mem_map += TRIO_NUM_MAP_MEM_REGIONS; + mem_map_base = MEM_MAP_INTR_REGIONS_BASE + + mem_map * MEM_MAP_INTR_REGION_SIZE; + mem_map_limit = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 1; + + msi_addr = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 8; + msg.data = (unsigned int)doorbell_template.word; + } else { + /* SQ regions are out, allocate from map mem regions. */ + mem_map = gxio_trio_alloc_memory_maps(trio_context, 1, 0, 0); + if (mem_map < 0) { + dev_printk(KERN_INFO, &pdev->dev, + "%s Mem-Map alloc failure. " + "Failed to initialize MSI interrupts. " + "Falling back to legacy interrupts.\n", + desc->msi_attrib.is_msix ? "MSI-X" : "MSI"); + ret = -ENOMEM; + goto msi_mem_map_alloc_failure; + } + + mem_map_base = MEM_MAP_INTR_REGIONS_BASE + + mem_map * MEM_MAP_INTR_REGION_SIZE; + mem_map_limit = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 1; + + msi_addr = mem_map_base + TRIO_MAP_MEM_REG_INT3 - + TRIO_MAP_MEM_REG_INT0; + + msg.data = mem_map; + } + + /* We try to distribute different IRQs to different tiles. */ + cpu = tile_irq_cpu(irq); + + /* + * Now call up to the HV to configure the MSI interrupt and + * set up the IPI binding. + */ + ret = gxio_trio_config_msi_intr(trio_context, cpu_x(cpu), cpu_y(cpu), + KERNEL_PL, irq, controller->mac, + mem_map, mem_map_base, mem_map_limit, + trio_context->asid); + if (ret < 0) { + dev_printk(KERN_INFO, &pdev->dev, "HV MSI config failed.\n"); + + goto hv_msi_config_failure; + } + + irq_set_msi_desc(irq, desc); + + msg.address_hi = msi_addr >> 32; + msg.address_lo = msi_addr & 0xffffffff; + + write_msi_msg(irq, &msg); + irq_set_chip_and_handler(irq, &tilegx_msi_chip, handle_level_irq); + irq_set_handler_data(irq, controller); + + return 0; + +hv_msi_config_failure: + /* Free mem-map */ +msi_mem_map_alloc_failure: +is_64_failure: + irq_free_hwirq(irq); + return ret; +} + +void arch_teardown_msi_irq(unsigned int irq) +{ + irq_free_hwirq(irq); +} diff --git a/arch/tile/kernel/perf_event.c b/arch/tile/kernel/perf_event.c new file mode 100644 index 00000000000..2bf6c9c135c --- /dev/null +++ b/arch/tile/kernel/perf_event.c @@ -0,0 +1,1005 @@ +/* + * Copyright 2014 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + * + * + * Perf_events support for Tile processor. + * + * This code is based upon the x86 perf event + * code, which is: + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> + * Copyright (C) 2009 Google, Inc., Stephane Eranian + */ + +#include <linux/kprobes.h> +#include <linux/kernel.h> +#include <linux/kdebug.h> +#include <linux/mutex.h> +#include <linux/bitmap.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/perf_event.h> +#include <linux/atomic.h> +#include <asm/traps.h> +#include <asm/stack.h> +#include <asm/pmc.h> +#include <hv/hypervisor.h> + +#define TILE_MAX_COUNTERS 4 + +#define PERF_COUNT_0_IDX 0 +#define PERF_COUNT_1_IDX 1 +#define AUX_PERF_COUNT_0_IDX 2 +#define AUX_PERF_COUNT_1_IDX 3 + +struct cpu_hw_events { + int n_events; + struct perf_event *events[TILE_MAX_COUNTERS]; /* counter order */ + struct perf_event *event_list[TILE_MAX_COUNTERS]; /* enabled + order */ + int assign[TILE_MAX_COUNTERS]; + unsigned long active_mask[BITS_TO_LONGS(TILE_MAX_COUNTERS)]; + unsigned long used_mask; +}; + +/* TILE arch specific performance monitor unit */ +struct tile_pmu { + const char *name; + int version; + const int *hw_events; /* generic hw events table */ + /* generic hw cache events table */ + const int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + int (*map_hw_event)(u64); /*method used to map + hw events */ + int (*map_cache_event)(u64); /*method used to map + cache events */ + + u64 max_period; /* max sampling period */ + u64 cntval_mask; /* counter width mask */ + int cntval_bits; /* counter width */ + int max_events; /* max generic hw events + in map */ + int num_counters; /* number base + aux counters */ + int num_base_counters; /* number base counters */ +}; + +DEFINE_PER_CPU(u64, perf_irqs); +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +#define TILE_OP_UNSUPP (-1) + +#ifndef __tilegx__ +/* TILEPro hardware events map */ +static const int tile_hw_event_map[] = { + [PERF_COUNT_HW_CPU_CYCLES] = 0x01, /* ONE */ + [PERF_COUNT_HW_INSTRUCTIONS] = 0x06, /* MP_BUNDLE_RETIRED */ + [PERF_COUNT_HW_CACHE_REFERENCES] = TILE_OP_UNSUPP, + [PERF_COUNT_HW_CACHE_MISSES] = TILE_OP_UNSUPP, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x16, /* + MP_CONDITIONAL_BRANCH_ISSUED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x14, /* + MP_CONDITIONAL_BRANCH_MISSPREDICT */ + [PERF_COUNT_HW_BUS_CYCLES] = TILE_OP_UNSUPP, +}; +#else +/* TILEGx hardware events map */ +static const int tile_hw_event_map[] = { + [PERF_COUNT_HW_CPU_CYCLES] = 0x181, /* ONE */ + [PERF_COUNT_HW_INSTRUCTIONS] = 0xdb, /* INSTRUCTION_BUNDLE */ + [PERF_COUNT_HW_CACHE_REFERENCES] = TILE_OP_UNSUPP, + [PERF_COUNT_HW_CACHE_MISSES] = TILE_OP_UNSUPP, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0xd9, /* + COND_BRANCH_PRED_CORRECT */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0xda, /* + COND_BRANCH_PRED_INCORRECT */ + [PERF_COUNT_HW_BUS_CYCLES] = TILE_OP_UNSUPP, +}; +#endif + +#define C(x) PERF_COUNT_HW_CACHE_##x + +/* + * Generalized hw caching related hw_event table, filled + * in on a per model basis. A value of -1 means + * 'not supported', any other value means the + * raw hw_event ID. + */ +#ifndef __tilegx__ +/* TILEPro hardware cache event map */ +static const int tile_cache_event_map[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = 0x21, /* RD_MISS */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = 0x22, /* WR_MISS */ + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x12, /* MP_ICACHE_HIT_ISSUED */ + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x1d, /* TLB_CNT */ + [C(RESULT_MISS)] = 0x20, /* TLB_EXCEPTION */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x13, /* MP_ITLB_HIT_ISSUED */ + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, +}, +}; +#else +/* TILEGx hardware events map */ +static const int tile_cache_event_map[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + /* + * Like some other architectures (e.g. ARM), the performance + * counters don't differentiate between read and write + * accesses/misses, so this isn't strictly correct, but it's the + * best we can do. Writes and reads get combined. + */ + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = 0x44, /* RD_MISS */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = 0x45, /* WR_MISS */ + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x40, /* TLB_CNT */ + [C(RESULT_MISS)] = 0x43, /* TLB_EXCEPTION */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x40, /* TLB_CNT */ + [C(RESULT_MISS)] = 0x43, /* TLB_EXCEPTION */ + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = 0xd4, /* ITLB_MISS_INT */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = 0xd4, /* ITLB_MISS_INT */ + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = TILE_OP_UNSUPP, + [C(RESULT_MISS)] = TILE_OP_UNSUPP, + }, +}, +}; +#endif + +static atomic_t tile_active_events; +static DEFINE_MUTEX(perf_intr_reserve_mutex); + +static int tile_map_hw_event(u64 config); +static int tile_map_cache_event(u64 config); + +static int tile_pmu_handle_irq(struct pt_regs *regs, int fault); + +/* + * To avoid new_raw_count getting larger then pre_raw_count + * in tile_perf_event_update(), we limit the value of max_period to 2^31 - 1. + */ +static const struct tile_pmu tilepmu = { +#ifndef __tilegx__ + .name = "tilepro", +#else + .name = "tilegx", +#endif + .max_events = ARRAY_SIZE(tile_hw_event_map), + .map_hw_event = tile_map_hw_event, + .hw_events = tile_hw_event_map, + .map_cache_event = tile_map_cache_event, + .cache_events = &tile_cache_event_map, + .cntval_bits = 32, + .cntval_mask = (1ULL << 32) - 1, + .max_period = (1ULL << 31) - 1, + .num_counters = TILE_MAX_COUNTERS, + .num_base_counters = TILE_BASE_COUNTERS, +}; + +static const struct tile_pmu *tile_pmu __read_mostly; + +/* + * Check whether perf event is enabled. + */ +int tile_perf_enabled(void) +{ + return atomic_read(&tile_active_events) != 0; +} + +/* + * Read Performance Counters. + */ +static inline u64 read_counter(int idx) +{ + u64 val = 0; + + /* __insn_mfspr() only takes an immediate argument */ + switch (idx) { + case PERF_COUNT_0_IDX: + val = __insn_mfspr(SPR_PERF_COUNT_0); + break; + case PERF_COUNT_1_IDX: + val = __insn_mfspr(SPR_PERF_COUNT_1); + break; + case AUX_PERF_COUNT_0_IDX: + val = __insn_mfspr(SPR_AUX_PERF_COUNT_0); + break; + case AUX_PERF_COUNT_1_IDX: + val = __insn_mfspr(SPR_AUX_PERF_COUNT_1); + break; + default: + WARN_ON_ONCE(idx > AUX_PERF_COUNT_1_IDX || + idx < PERF_COUNT_0_IDX); + } + + return val; +} + +/* + * Write Performance Counters. + */ +static inline void write_counter(int idx, u64 value) +{ + /* __insn_mtspr() only takes an immediate argument */ + switch (idx) { + case PERF_COUNT_0_IDX: + __insn_mtspr(SPR_PERF_COUNT_0, value); + break; + case PERF_COUNT_1_IDX: + __insn_mtspr(SPR_PERF_COUNT_1, value); + break; + case AUX_PERF_COUNT_0_IDX: + __insn_mtspr(SPR_AUX_PERF_COUNT_0, value); + break; + case AUX_PERF_COUNT_1_IDX: + __insn_mtspr(SPR_AUX_PERF_COUNT_1, value); + break; + default: + WARN_ON_ONCE(idx > AUX_PERF_COUNT_1_IDX || + idx < PERF_COUNT_0_IDX); + } +} + +/* + * Enable performance event by setting + * Performance Counter Control registers. + */ +static inline void tile_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned long cfg, mask; + int shift, idx = hwc->idx; + + /* + * prevent early activation from tile_pmu_start() in hw_perf_enable + */ + + if (WARN_ON_ONCE(idx == -1)) + return; + + if (idx < tile_pmu->num_base_counters) + cfg = __insn_mfspr(SPR_PERF_COUNT_CTL); + else + cfg = __insn_mfspr(SPR_AUX_PERF_COUNT_CTL); + + switch (idx) { + case PERF_COUNT_0_IDX: + case AUX_PERF_COUNT_0_IDX: + mask = TILE_EVENT_MASK; + shift = 0; + break; + case PERF_COUNT_1_IDX: + case AUX_PERF_COUNT_1_IDX: + mask = TILE_EVENT_MASK << 16; + shift = 16; + break; + default: + WARN_ON_ONCE(idx < PERF_COUNT_0_IDX || + idx > AUX_PERF_COUNT_1_IDX); + return; + } + + /* Clear mask bits to enable the event. */ + cfg &= ~mask; + cfg |= hwc->config << shift; + + if (idx < tile_pmu->num_base_counters) + __insn_mtspr(SPR_PERF_COUNT_CTL, cfg); + else + __insn_mtspr(SPR_AUX_PERF_COUNT_CTL, cfg); +} + +/* + * Disable performance event by clearing + * Performance Counter Control registers. + */ +static inline void tile_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned long cfg, mask; + int idx = hwc->idx; + + if (idx == -1) + return; + + if (idx < tile_pmu->num_base_counters) + cfg = __insn_mfspr(SPR_PERF_COUNT_CTL); + else + cfg = __insn_mfspr(SPR_AUX_PERF_COUNT_CTL); + + switch (idx) { + case PERF_COUNT_0_IDX: + case AUX_PERF_COUNT_0_IDX: + mask = TILE_PLM_MASK; + break; + case PERF_COUNT_1_IDX: + case AUX_PERF_COUNT_1_IDX: + mask = TILE_PLM_MASK << 16; + break; + default: + WARN_ON_ONCE(idx < PERF_COUNT_0_IDX || + idx > AUX_PERF_COUNT_1_IDX); + return; + } + + /* Set mask bits to disable the event. */ + cfg |= mask; + + if (idx < tile_pmu->num_base_counters) + __insn_mtspr(SPR_PERF_COUNT_CTL, cfg); + else + __insn_mtspr(SPR_AUX_PERF_COUNT_CTL, cfg); +} + +/* + * Propagate event elapsed time into the generic event. + * Can only be executed on the CPU where the event is active. + * Returns the delta events processed. + */ +static u64 tile_perf_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - tile_pmu->cntval_bits; + u64 prev_raw_count, new_raw_count; + u64 oldval; + int idx = hwc->idx; + u64 delta; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ +again: + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = read_counter(idx); + + oldval = local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count); + if (oldval != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the event disabled in hw: + */ +static int tile_event_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int ret = 0; + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + if (left > tile_pmu->max_period) + left = tile_pmu->max_period; + + /* + * The hw event starts counting from this event offset, + * mark it to be able to extra future deltas: + */ + local64_set(&hwc->prev_count, (u64)-left); + + write_counter(idx, (u64)(-left) & tile_pmu->cntval_mask); + + perf_event_update_userpage(event); + + return ret; +} + +/* + * Stop the event but do not release the PMU counter + */ +static void tile_pmu_stop(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + if (__test_and_clear_bit(idx, cpuc->active_mask)) { + tile_pmu_disable_event(event); + cpuc->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + tile_perf_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +/* + * Start an event (without re-assigning counter) + */ +static void tile_pmu_start(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx = event->hw.idx; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1)) + return; + + if (flags & PERF_EF_RELOAD) { + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + tile_event_set_period(event); + } + + event->hw.state = 0; + + cpuc->events[idx] = event; + __set_bit(idx, cpuc->active_mask); + + unmask_pmc_interrupts(); + + tile_pmu_enable_event(event); + + perf_event_update_userpage(event); +} + +/* + * Add a single event to the PMU. + * + * The event is added to the group of enabled events + * but only if it can be scehduled with existing events. + */ +static int tile_pmu_add(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc; + unsigned long mask; + int b, max_cnt; + + hwc = &event->hw; + + /* + * We are full. + */ + if (cpuc->n_events == tile_pmu->num_counters) + return -ENOSPC; + + cpuc->event_list[cpuc->n_events] = event; + cpuc->n_events++; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + /* + * Find first empty counter. + */ + max_cnt = tile_pmu->num_counters; + mask = ~cpuc->used_mask; + + /* Find next free counter. */ + b = find_next_bit(&mask, max_cnt, 0); + + /* Should not happen. */ + if (WARN_ON_ONCE(b == max_cnt)) + return -ENOSPC; + + /* + * Assign counter to event. + */ + event->hw.idx = b; + __set_bit(b, &cpuc->used_mask); + + /* + * Start if requested. + */ + if (flags & PERF_EF_START) + tile_pmu_start(event, PERF_EF_RELOAD); + + return 0; +} + +/* + * Delete a single event from the PMU. + * + * The event is deleted from the group of enabled events. + * If it is the last event, disable PMU interrupt. + */ +static void tile_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int i; + + /* + * Remove event from list, compact list if necessary. + */ + for (i = 0; i < cpuc->n_events; i++) { + if (cpuc->event_list[i] == event) { + while (++i < cpuc->n_events) + cpuc->event_list[i-1] = cpuc->event_list[i]; + --cpuc->n_events; + cpuc->events[event->hw.idx] = NULL; + __clear_bit(event->hw.idx, &cpuc->used_mask); + tile_pmu_stop(event, PERF_EF_UPDATE); + break; + } + } + /* + * If there are no events left, then mask PMU interrupt. + */ + if (cpuc->n_events == 0) + mask_pmc_interrupts(); + perf_event_update_userpage(event); +} + +/* + * Propagate event elapsed time into the event. + */ +static inline void tile_pmu_read(struct perf_event *event) +{ + tile_perf_event_update(event); +} + +/* + * Map generic events to Tile PMU. + */ +static int tile_map_hw_event(u64 config) +{ + if (config >= tile_pmu->max_events) + return -EINVAL; + return tile_pmu->hw_events[config]; +} + +/* + * Map generic hardware cache events to Tile PMU. + */ +static int tile_map_cache_event(u64 config) +{ + unsigned int cache_type, cache_op, cache_result; + int code; + + if (!tile_pmu->cache_events) + return -ENOENT; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + code = (*tile_pmu->cache_events)[cache_type][cache_op][cache_result]; + if (code == TILE_OP_UNSUPP) + return -EINVAL; + + return code; +} + +static void tile_event_destroy(struct perf_event *event) +{ + if (atomic_dec_return(&tile_active_events) == 0) + release_pmc_hardware(); +} + +static int __tile_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + int code; + + switch (attr->type) { + case PERF_TYPE_HARDWARE: + code = tile_pmu->map_hw_event(attr->config); + break; + case PERF_TYPE_HW_CACHE: + code = tile_pmu->map_cache_event(attr->config); + break; + case PERF_TYPE_RAW: + code = attr->config & TILE_EVENT_MASK; + break; + default: + /* Should not happen. */ + return -EOPNOTSUPP; + } + + if (code < 0) + return code; + + hwc->config = code; + hwc->idx = -1; + + if (attr->exclude_user) + hwc->config |= TILE_CTL_EXCL_USER; + + if (attr->exclude_kernel) + hwc->config |= TILE_CTL_EXCL_KERNEL; + + if (attr->exclude_hv) + hwc->config |= TILE_CTL_EXCL_HV; + + if (!hwc->sample_period) { + hwc->sample_period = tile_pmu->max_period; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + } + event->destroy = tile_event_destroy; + return 0; +} + +static int tile_event_init(struct perf_event *event) +{ + int err = 0; + perf_irq_t old_irq_handler = NULL; + + if (atomic_inc_return(&tile_active_events) == 1) + old_irq_handler = reserve_pmc_hardware(tile_pmu_handle_irq); + + if (old_irq_handler) { + pr_warn("PMC hardware busy (reserved by oprofile)\n"); + + atomic_dec(&tile_active_events); + return -EBUSY; + } + + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + return -ENOENT; + } + + err = __tile_event_init(event); + if (err) { + if (event->destroy) + event->destroy(event); + } + return err; +} + +static struct pmu tilera_pmu = { + .event_init = tile_event_init, + .add = tile_pmu_add, + .del = tile_pmu_del, + + .start = tile_pmu_start, + .stop = tile_pmu_stop, + + .read = tile_pmu_read, +}; + +/* + * PMU's IRQ handler, PMU has 2 interrupts, they share the same handler. + */ +int tile_pmu_handle_irq(struct pt_regs *regs, int fault) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_event *event; + struct hw_perf_event *hwc; + u64 val; + unsigned long status; + int bit; + + __get_cpu_var(perf_irqs)++; + + if (!atomic_read(&tile_active_events)) + return 0; + + status = pmc_get_overflow(); + pmc_ack_overflow(status); + + for_each_set_bit(bit, &status, tile_pmu->num_counters) { + + event = cpuc->events[bit]; + + if (!event) + continue; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + hwc = &event->hw; + + val = tile_perf_event_update(event); + if (val & (1ULL << (tile_pmu->cntval_bits - 1))) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + if (!tile_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + tile_pmu_stop(event, 0); + } + + return 0; +} + +static bool __init supported_pmu(void) +{ + tile_pmu = &tilepmu; + return true; +} + +int __init init_hw_perf_events(void) +{ + supported_pmu(); + perf_pmu_register(&tilera_pmu, "cpu", PERF_TYPE_RAW); + return 0; +} +arch_initcall(init_hw_perf_events); + +/* Callchain handling code. */ + +/* + * Tile specific backtracing code for perf_events. + */ +static inline void perf_callchain(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ + struct KBacktraceIterator kbt; + unsigned int i; + + /* + * Get the address just after the "jalr" instruction that + * jumps to the handler for a syscall. When we find this + * address in a backtrace, we silently ignore it, which gives + * us a one-step backtrace connection from the sys_xxx() + * function in the kernel to the xxx() function in libc. + * Otherwise, we lose the ability to properly attribute time + * from the libc calls to the kernel implementations, since + * oprofile only considers PCs from backtraces a pair at a time. + */ + unsigned long handle_syscall_pc = handle_syscall_link_address(); + + KBacktraceIterator_init(&kbt, NULL, regs); + kbt.profile = 1; + + /* + * The sample for the pc is already recorded. Now we are adding the + * address of the callsites on the stack. Our iterator starts + * with the frame of the (already sampled) call site. If our + * iterator contained a "return address" field, we could have just + * used it and wouldn't have needed to skip the first + * frame. That's in effect what the arm and x86 versions do. + * Instead we peel off the first iteration to get the equivalent + * behavior. + */ + + if (KBacktraceIterator_end(&kbt)) + return; + KBacktraceIterator_next(&kbt); + + /* + * Set stack depth to 16 for user and kernel space respectively, that + * is, total 32 stack frames. + */ + for (i = 0; i < 16; ++i) { + unsigned long pc; + if (KBacktraceIterator_end(&kbt)) + break; + pc = kbt.it.pc; + if (pc != handle_syscall_pc) + perf_callchain_store(entry, pc); + KBacktraceIterator_next(&kbt); + } +} + +void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ + perf_callchain(entry, regs); +} + +void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ + perf_callchain(entry, regs); +} diff --git a/arch/tile/kernel/pmc.c b/arch/tile/kernel/pmc.c new file mode 100644 index 00000000000..db62cc34b95 --- /dev/null +++ b/arch/tile/kernel/pmc.c @@ -0,0 +1,121 @@ +/* + * Copyright 2014 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/errno.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/atomic.h> +#include <linux/interrupt.h> + +#include <asm/processor.h> +#include <asm/pmc.h> + +perf_irq_t perf_irq = NULL; +int handle_perf_interrupt(struct pt_regs *regs, int fault) +{ + int retval; + + if (!perf_irq) + panic("Unexpected PERF_COUNT interrupt %d\n", fault); + + nmi_enter(); + retval = perf_irq(regs, fault); + nmi_exit(); + return retval; +} + +/* Reserve PMC hardware if it is available. */ +perf_irq_t reserve_pmc_hardware(perf_irq_t new_perf_irq) +{ + return cmpxchg(&perf_irq, NULL, new_perf_irq); +} +EXPORT_SYMBOL(reserve_pmc_hardware); + +/* Release PMC hardware. */ +void release_pmc_hardware(void) +{ + perf_irq = NULL; +} +EXPORT_SYMBOL(release_pmc_hardware); + + +/* + * Get current overflow status of each performance counter, + * and auxiliary performance counter. + */ +unsigned long +pmc_get_overflow(void) +{ + unsigned long status; + + /* + * merge base+aux into a single vector + */ + status = __insn_mfspr(SPR_PERF_COUNT_STS); + status |= __insn_mfspr(SPR_AUX_PERF_COUNT_STS) << TILE_BASE_COUNTERS; + return status; +} + +/* + * Clear the status bit for the corresponding counter, if written + * with a one. + */ +void +pmc_ack_overflow(unsigned long status) +{ + /* + * clear overflow status by writing ones + */ + __insn_mtspr(SPR_PERF_COUNT_STS, status); + __insn_mtspr(SPR_AUX_PERF_COUNT_STS, status >> TILE_BASE_COUNTERS); +} + +/* + * The perf count interrupts are masked and unmasked explicitly, + * and only here. The normal irq_enable() does not enable them, + * and irq_disable() does not disable them. That lets these + * routines drive the perf count interrupts orthogonally. + * + * We also mask the perf count interrupts on entry to the perf count + * interrupt handler in assembly code, and by default unmask them + * again (with interrupt critical section protection) just before + * returning from the interrupt. If the perf count handler returns + * a non-zero error code, then we don't re-enable them before returning. + * + * For Pro, we rely on both interrupts being in the same word to update + * them atomically so we never have one enabled and one disabled. + */ + +#if CHIP_HAS_SPLIT_INTR_MASK() +# if INT_PERF_COUNT < 32 || INT_AUX_PERF_COUNT < 32 +# error Fix assumptions about which word PERF_COUNT interrupts are in +# endif +#endif + +static inline unsigned long long pmc_mask(void) +{ + unsigned long long mask = 1ULL << INT_PERF_COUNT; + mask |= 1ULL << INT_AUX_PERF_COUNT; + return mask; +} + +void unmask_pmc_interrupts(void) +{ + interrupt_mask_reset_mask(pmc_mask()); +} + +void mask_pmc_interrupts(void) +{ + interrupt_mask_set_mask(pmc_mask()); +} diff --git a/arch/tile/kernel/proc.c b/arch/tile/kernel/proc.c index 62d820833c6..6829a950864 100644 --- a/arch/tile/kernel/proc.c +++ b/arch/tile/kernel/proc.c @@ -22,7 +22,9 @@ #include <linux/proc_fs.h> #include <linux/sysctl.h> #include <linux/hardirq.h> +#include <linux/hugetlb.h> #include <linux/mman.h> +#include <asm/unaligned.h> #include <asm/pgtable.h> #include <asm/processor.h> #include <asm/sections.h> @@ -111,8 +113,7 @@ arch_initcall(proc_tile_init); * Support /proc/sys/tile directory */ -#ifndef __tilegx__ /* FIXME: GX: no support for unaligned access yet */ -static ctl_table unaligned_subtable[] = { +static struct ctl_table unaligned_subtable[] = { { .procname = "enabled", .data = &unaligned_fixup, @@ -137,7 +138,7 @@ static ctl_table unaligned_subtable[] = { {} }; -static ctl_table unaligned_table[] = { +static struct ctl_table unaligned_table[] = { { .procname = "unaligned_fixup", .mode = 0555, @@ -145,7 +146,6 @@ static ctl_table unaligned_table[] = { }, {} }; -#endif static struct ctl_path tile_path[] = { { .procname = "tile" }, @@ -154,9 +154,7 @@ static struct ctl_path tile_path[] = { static int __init proc_sys_tile_init(void) { -#ifndef __tilegx__ /* FIXME: GX: no support for unaligned access yet */ register_sysctl_paths(tile_path, unaligned_table); -#endif return 0; } diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 4c1ac6e5347..16ed5894875 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -27,24 +27,25 @@ #include <linux/kernel.h> #include <linux/tracehook.h> #include <linux/signal.h> -#include <asm/system.h> #include <asm/stack.h> +#include <asm/switch_to.h> #include <asm/homecache.h> #include <asm/syscalls.h> #include <asm/traps.h> +#include <asm/setup.h> +#include <asm/uaccess.h> #ifdef CONFIG_HARDWALL #include <asm/hardwall.h> #endif #include <arch/chip.h> #include <arch/abi.h> - +#include <arch/sim_def.h> /* * Use the (x86) "idle=poll" option to prefer low latency when leaving the * idle loop over low power while in the idle loop, e.g. if we have * one thread per core and we want to get threads out of futex waits fast. */ -static int no_idle_nap; static int __init idle_setup(char *str) { if (!str) @@ -52,105 +53,28 @@ static int __init idle_setup(char *str) if (!strcmp(str, "poll")) { pr_info("using polling idle threads.\n"); - no_idle_nap = 1; - } else if (!strcmp(str, "halt")) - no_idle_nap = 0; - else - return -1; - - return 0; -} -early_param("idle", idle_setup); - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle(void) -{ - int cpu = smp_processor_id(); - - - current_thread_info()->status |= TS_POLLING; - - if (no_idle_nap) { - while (1) { - while (!need_resched()) - cpu_relax(); - schedule(); - } - } - - /* endless idle loop with no priority at all */ - while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); - while (!need_resched()) { - if (cpu_is_offline(cpu)) - BUG(); /* no HOTPLUG_CPU */ - - local_irq_disable(); - __get_cpu_var(irq_stat).idle_timestamp = jiffies; - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - - if (!need_resched()) - _cpu_idle(); - else - local_irq_enable(); - current_thread_info()->status |= TS_POLLING; - } - rcu_idle_exit(); - tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + cpu_idle_poll_ctrl(true); + return 0; + } else if (!strcmp(str, "halt")) { + return 0; } + return -1; } +early_param("idle", idle_setup); -struct thread_info *alloc_thread_info_node(struct task_struct *task, int node) +void arch_cpu_idle(void) { - struct page *page; - gfp_t flags = GFP_KERNEL; - -#ifdef CONFIG_DEBUG_STACK_USAGE - flags |= __GFP_ZERO; -#endif - - page = alloc_pages_node(node, flags, THREAD_SIZE_ORDER); - if (!page) - return NULL; - - return (struct thread_info *)page_address(page); + __get_cpu_var(irq_stat).idle_timestamp = jiffies; + _cpu_idle(); } /* - * Free a thread_info node, and all of its derivative - * data structures. + * Release a thread_info structure */ -void free_thread_info(struct thread_info *info) +void arch_release_thread_info(struct thread_info *info) { struct single_step_state *step_state = info->step_state; -#ifdef CONFIG_HARDWALL - /* - * We free a thread_info from the context of the task that has - * been scheduled next, so the original task is already dead. - * Calling deactivate here just frees up the data structures. - * If the task we're freeing held the last reference to a - * hardwall fd, it would have been released prior to this point - * anyway via exit_files(), and "hardwall" would be NULL by now. - */ - if (info->task->thread.hardwall) - hardwall_deactivate(info->task); -#endif - if (step_state) { /* @@ -169,25 +93,54 @@ void free_thread_info(struct thread_info *info) */ kfree(step_state); } - - free_pages((unsigned long)info, THREAD_SIZE_ORDER); } static void save_arch_state(struct thread_struct *t); int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long stack_size, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { - struct pt_regs *childregs; + struct pt_regs *childregs = task_pt_regs(p); unsigned long ksp; + unsigned long *callee_regs; + + /* + * Set up the stack and stack pointer appropriately for the + * new child to find itself woken up in __switch_to(). + * The callee-saved registers must be on the stack to be read; + * the new task will then jump to assembly support to handle + * calling schedule_tail(), etc., and (for userspace tasks) + * returning to the context set up in the pt_regs. + */ + ksp = (unsigned long) childregs; + ksp -= C_ABI_SAVE_AREA_SIZE; /* interrupt-entry save area */ + ((long *)ksp)[0] = ((long *)ksp)[1] = 0; + ksp -= CALLEE_SAVED_REGS_COUNT * sizeof(unsigned long); + callee_regs = (unsigned long *)ksp; + ksp -= C_ABI_SAVE_AREA_SIZE; /* __switch_to() save area */ + ((long *)ksp)[0] = ((long *)ksp)[1] = 0; + p->thread.ksp = ksp; + + /* Record the pid of the task that created this one. */ + p->thread.creator_pid = current->pid; + + if (unlikely(p->flags & PF_KTHREAD)) { + /* kernel thread */ + memset(childregs, 0, sizeof(struct pt_regs)); + memset(&callee_regs[2], 0, + (CALLEE_SAVED_REGS_COUNT - 2) * sizeof(unsigned long)); + callee_regs[0] = sp; /* r30 = function */ + callee_regs[1] = arg; /* r31 = arg */ + childregs->ex1 = PL_ICS_EX1(KERNEL_PL, 0); + p->thread.pc = (unsigned long) ret_from_kernel_thread; + return 0; + } /* - * When creating a new kernel thread we pass sp as zero. - * Assign it to a reasonable value now that we have the stack. + * Start new thread in ret_from_fork so it schedules properly + * and then return from interrupt like the parent. */ - if (sp == 0 && regs->ex1 == PL_ICS_EX1(KERNEL_PL, 0)) - sp = KSTK_TOP(p); + p->thread.pc = (unsigned long) ret_from_fork; /* * Do not clone step state from the parent; each thread @@ -195,52 +148,35 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, */ task_thread_info(p)->step_state = NULL; +#ifdef __tilegx__ /* - * Start new thread in ret_from_fork so it schedules properly - * and then return from interrupt like the parent. + * Do not clone unalign jit fixup from the parent; each thread + * must allocate its own on demand. */ - p->thread.pc = (unsigned long) ret_from_fork; - - /* Save user stack top pointer so we can ID the stack vm area later. */ - p->thread.usp0 = sp; - - /* Record the pid of the process that created this one. */ - p->thread.creator_pid = current->pid; + task_thread_info(p)->unalign_jit_base = NULL; +#endif /* * Copy the registers onto the kernel stack so the * return-from-interrupt code will reload it into registers. */ - childregs = task_pt_regs(p); - *childregs = *regs; + *childregs = *current_pt_regs(); childregs->regs[0] = 0; /* return value is zero */ - childregs->sp = sp; /* override with new user stack pointer */ + if (sp) + childregs->sp = sp; /* override with new user stack pointer */ + memcpy(callee_regs, &childregs->regs[CALLEE_SAVED_FIRST_REG], + CALLEE_SAVED_REGS_COUNT * sizeof(unsigned long)); + + /* Save user stack top pointer so we can ID the stack vm area later. */ + p->thread.usp0 = childregs->sp; /* * If CLONE_SETTLS is set, set "tp" in the new task to "r4", * which is passed in as arg #5 to sys_clone(). */ if (clone_flags & CLONE_SETTLS) - childregs->tp = regs->regs[4]; + childregs->tp = childregs->regs[4]; - /* - * Copy the callee-saved registers from the passed pt_regs struct - * into the context-switch callee-saved registers area. - * This way when we start the interrupt-return sequence, the - * callee-save registers will be correctly in registers, which - * is how we assume the compiler leaves them as we start doing - * the normal return-from-interrupt path after calling C code. - * Zero out the C ABI save area to mark the top of the stack. - */ - ksp = (unsigned long) childregs; - ksp -= C_ABI_SAVE_AREA_SIZE; /* interrupt-entry save area */ - ((long *)ksp)[0] = ((long *)ksp)[1] = 0; - ksp -= CALLEE_SAVED_REGS_COUNT * sizeof(unsigned long); - memcpy((void *)ksp, ®s->regs[CALLEE_SAVED_FIRST_REG], - CALLEE_SAVED_REGS_COUNT * sizeof(unsigned long)); - ksp -= C_ABI_SAVE_AREA_SIZE; /* __switch_to() save area */ - ((long *)ksp)[0] = ((long *)ksp)[1] = 0; - p->thread.ksp = ksp; #if CHIP_HAS_TILE_DMA() /* @@ -251,20 +187,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, memset(&p->thread.dma_async_tlb, 0, sizeof(struct async_tlb)); #endif -#if CHIP_HAS_SN_PROC() - /* Likewise, the new thread is not running static processor code. */ - p->thread.sn_proc_running = 0; - memset(&p->thread.sn_async_tlb, 0, sizeof(struct async_tlb)); -#endif - -#if CHIP_HAS_PROC_STATUS_SPR() /* New thread has its miscellaneous processor state bits clear. */ p->thread.proc_status = 0; -#endif #ifdef CONFIG_HARDWALL /* New thread does not own any networks. */ - p->thread.hardwall = NULL; + memset(&p->thread.hardwall[0], 0, + sizeof(struct hardwall_task) * HARDWALL_TYPES); #endif @@ -277,19 +206,32 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, return 0; } +int set_unalign_ctl(struct task_struct *tsk, unsigned int val) +{ + task_thread_info(tsk)->align_ctl = val; + return 0; +} + +int get_unalign_ctl(struct task_struct *tsk, unsigned long adr) +{ + return put_user(task_thread_info(tsk)->align_ctl, + (unsigned int __user *)adr); +} + +static struct task_struct corrupt_current = { .comm = "<corrupt>" }; + /* * Return "current" if it looks plausible, or else a pointer to a dummy. * This can be helpful if we are just trying to emit a clean panic. */ struct task_struct *validate_current(void) { - static struct task_struct corrupt = { .comm = "<corrupt>" }; struct task_struct *tsk = current; if (unlikely((unsigned long)tsk < PAGE_OFFSET || - (void *)tsk > high_memory || + (high_memory && (void *)tsk > high_memory) || ((unsigned long)tsk & (__alignof__(*tsk) - 1)) != 0)) { pr_err("Corrupt 'current' %p (sp %#lx)\n", tsk, stack_pointer); - tsk = &corrupt; + tsk = &corrupt_current; } return tsk; } @@ -428,15 +370,11 @@ static void save_arch_state(struct thread_struct *t) t->system_save[2] = __insn_mfspr(SPR_SYSTEM_SAVE_0_2); t->system_save[3] = __insn_mfspr(SPR_SYSTEM_SAVE_0_3); t->intctrl_0 = __insn_mfspr(SPR_INTCTRL_0_STATUS); -#if CHIP_HAS_PROC_STATUS_SPR() t->proc_status = __insn_mfspr(SPR_PROC_STATUS); -#endif #if !CHIP_HAS_FIXED_INTVEC_BASE() t->interrupt_vector_base = __insn_mfspr(SPR_INTERRUPT_VECTOR_BASE_0); #endif -#if CHIP_HAS_TILE_RTF_HWM() t->tile_rtf_hwm = __insn_mfspr(SPR_TILE_RTF_HWM); -#endif #if CHIP_HAS_DSTREAM_PF() t->dstream_pf = __insn_mfspr(SPR_DSTREAM_PF); #endif @@ -457,15 +395,11 @@ static void restore_arch_state(const struct thread_struct *t) __insn_mtspr(SPR_SYSTEM_SAVE_0_2, t->system_save[2]); __insn_mtspr(SPR_SYSTEM_SAVE_0_3, t->system_save[3]); __insn_mtspr(SPR_INTCTRL_0_STATUS, t->intctrl_0); -#if CHIP_HAS_PROC_STATUS_SPR() __insn_mtspr(SPR_PROC_STATUS, t->proc_status); -#endif #if !CHIP_HAS_FIXED_INTVEC_BASE() __insn_mtspr(SPR_INTERRUPT_VECTOR_BASE_0, t->interrupt_vector_base); #endif -#if CHIP_HAS_TILE_RTF_HWM() __insn_mtspr(SPR_TILE_RTF_HWM, t->tile_rtf_hwm); -#endif #if CHIP_HAS_DSTREAM_PF() __insn_mtspr(SPR_DSTREAM_PF, t->dstream_pf); #endif @@ -474,26 +408,11 @@ static void restore_arch_state(const struct thread_struct *t) void _prepare_arch_switch(struct task_struct *next) { -#if CHIP_HAS_SN_PROC() - int snctl; -#endif #if CHIP_HAS_TILE_DMA() struct tile_dma_state *dma = ¤t->thread.tile_dma_state; if (dma->enabled) save_tile_dma_state(dma); #endif -#if CHIP_HAS_SN_PROC() - /* - * Suspend the static network processor if it was running. - * We do not suspend the fabric itself, just like we don't - * try to suspend the UDN. - */ - snctl = __insn_mfspr(SPR_SNCTL); - current->thread.sn_proc_running = - (snctl & SPR_SNCTL__FRZPROC_MASK) == 0; - if (current->thread.sn_proc_running) - __insn_mtspr(SPR_SNCTL, snctl | SPR_SNCTL__FRZPROC_MASK); -#endif } @@ -521,25 +440,9 @@ struct task_struct *__sched _switch_to(struct task_struct *prev, /* Restore other arch state. */ restore_arch_state(&next->thread); -#if CHIP_HAS_SN_PROC() - /* - * Restart static network processor in the new process - * if it was running before. - */ - if (next->thread.sn_proc_running) { - int snctl = __insn_mfspr(SPR_SNCTL); - __insn_mtspr(SPR_SNCTL, snctl & ~SPR_SNCTL__FRZPROC_MASK); - } -#endif - #ifdef CONFIG_HARDWALL /* Enable or disable access to the network registers appropriately. */ - if (prev->thread.hardwall != NULL) { - if (next->thread.hardwall == NULL) - restrict_network_mpls(); - } else if (next->thread.hardwall != NULL) { - grant_network_mpls(); - } + hardwall_switch_tasks(prev, next); #endif /* @@ -567,11 +470,18 @@ struct task_struct *__sched _switch_to(struct task_struct *prev, */ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags) { + /* If we enter in kernel mode, do nothing and exit the caller loop. */ + if (!user_mode(regs)) + return 0; + + /* Enable interrupts; they are disabled again on return to caller. */ + local_irq_enable(); + if (thread_info_flags & _TIF_NEED_RESCHED) { schedule(); return 1; } -#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() +#if CHIP_HAS_TILE_DMA() if (thread_info_flags & _TIF_ASYNC_TLB) { do_async_page_fault(regs); return 1; @@ -584,74 +494,15 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); return 1; } if (thread_info_flags & _TIF_SINGLESTEP) { - if ((regs->ex1 & SPR_EX_CONTEXT_1_1__PL_MASK) == 0) - single_step_once(regs); + single_step_once(regs); return 0; } panic("work_pending: bad flags %#x\n", thread_info_flags); } -/* Note there is an implicit fifth argument if (clone_flags & CLONE_SETTLS). */ -SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, - void __user *, parent_tidptr, void __user *, child_tidptr, - struct pt_regs *, regs) -{ - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, - parent_tidptr, child_tidptr); -} - -/* - * sys_execve() executes a new program. - */ -SYSCALL_DEFINE4(execve, const char __user *, path, - const char __user *const __user *, argv, - const char __user *const __user *, envp, - struct pt_regs *, regs) -{ - long error; - char *filename; - - filename = getname(path); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - error = do_execve(filename, argv, envp, regs); - putname(filename); - if (error == 0) - single_step_execve(); -out: - return error; -} - -#ifdef CONFIG_COMPAT -long compat_sys_execve(const char __user *path, - compat_uptr_t __user *argv, - compat_uptr_t __user *envp, - struct pt_regs *regs) -{ - long error; - char *filename; - - filename = getname(path); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - error = compat_do_execve(filename, argv, envp, regs); - putname(filename); - if (error == 0) - single_step_execve(); -out: - return error; -} -#endif - unsigned long get_wchan(struct task_struct *p) { struct KBacktraceIterator kbt; @@ -669,37 +520,6 @@ unsigned long get_wchan(struct task_struct *p) return 0; } -/* - * We pass in lr as zero (cleared in kernel_thread) and the caller - * part of the backtrace ABI on the stack also zeroed (in copy_thread) - * so that backtraces will stop with this function. - * Note that we don't use r0, since copy_thread() clears it. - */ -static void start_kernel_thread(int dummy, int (*fn)(int), int arg) -{ - do_exit(fn(arg)); -} - -/* - * Create a kernel thread - */ -int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) -{ - struct pt_regs regs; - - memset(®s, 0, sizeof(regs)); - regs.ex1 = PL_ICS_EX1(KERNEL_PL, 0); /* run at kernel PL, no ICS */ - regs.pc = (long) start_kernel_thread; - regs.flags = PT_FLAGS_CALLER_SAVES; /* need to restore r1 and r2 */ - regs.regs[1] = (long) fn; /* function pointer */ - regs.regs[2] = (long) arg; /* parameter register */ - - /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, - 0, NULL, NULL); -} -EXPORT_SYMBOL(kernel_thread); - /* Flush thread state. */ void flush_thread(void) { @@ -711,7 +531,15 @@ void flush_thread(void) */ void exit_thread(void) { - /* Nothing */ +#ifdef CONFIG_HARDWALL + /* + * Remove the task from the list of tasks that are associated + * with any live hardwalls. (If the task that is exiting held + * the last reference to a hardwall fd, it would already have + * been released and deactivated at this point.) + */ + hardwall_deactivate_all(current); +#endif } void show_regs(struct pt_regs *regs) @@ -720,24 +548,24 @@ void show_regs(struct pt_regs *regs) int i; pr_err("\n"); - pr_err(" Pid: %d, comm: %20s, CPU: %d\n", - tsk->pid, tsk->comm, smp_processor_id()); + if (tsk != &corrupt_current) + show_regs_print_info(KERN_ERR); #ifdef __tilegx__ - for (i = 0; i < 51; i += 3) + for (i = 0; i < 17; i++) pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT" r%-2d: "REGFMT"\n", - i, regs->regs[i], i+1, regs->regs[i+1], - i+2, regs->regs[i+2]); - pr_err(" r51: "REGFMT" r52: "REGFMT" tp : "REGFMT"\n", - regs->regs[51], regs->regs[52], regs->tp); + i, regs->regs[i], i+18, regs->regs[i+18], + i+36, regs->regs[i+36]); + pr_err(" r17: "REGFMT" r35: "REGFMT" tp : "REGFMT"\n", + regs->regs[17], regs->regs[35], regs->tp); pr_err(" sp : "REGFMT" lr : "REGFMT"\n", regs->sp, regs->lr); #else - for (i = 0; i < 52; i += 4) + for (i = 0; i < 13; i++) pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT " r%-2d: "REGFMT" r%-2d: "REGFMT"\n", - i, regs->regs[i], i+1, regs->regs[i+1], - i+2, regs->regs[i+2], i+3, regs->regs[i+3]); - pr_err(" r52: "REGFMT" tp : "REGFMT" sp : "REGFMT" lr : "REGFMT"\n", - regs->regs[52], regs->tp, regs->sp, regs->lr); + i, regs->regs[i], i+14, regs->regs[i+14], + i+27, regs->regs[i+27], i+40, regs->regs[i+40]); + pr_err(" r13: "REGFMT" tp : "REGFMT" sp : "REGFMT" lr : "REGFMT"\n", + regs->regs[13], regs->tp, regs->sp, regs->lr); #endif pr_err(" pc : "REGFMT" ex1: %ld faultnum: %ld\n", regs->pc, regs->ex1, regs->faultnum); diff --git a/arch/tile/kernel/ptrace.c b/arch/tile/kernel/ptrace.c index e92e40527d6..de98c6ddf13 100644 --- a/arch/tile/kernel/ptrace.c +++ b/arch/tile/kernel/ptrace.c @@ -19,7 +19,14 @@ #include <linux/kprobes.h> #include <linux/compat.h> #include <linux/uaccess.h> +#include <linux/regset.h> +#include <linux/elf.h> +#include <linux/tracehook.h> #include <asm/traps.h> +#include <arch/chip.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> void user_enable_single_step(struct task_struct *child) { @@ -45,6 +52,100 @@ void ptrace_disable(struct task_struct *child) clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); } +/* + * Get registers from task and ready the result for userspace. + * Note that we localize the API issues to getregs() and putregs() at + * some cost in performance, e.g. we need a full pt_regs copy for + * PEEKUSR, and two copies for POKEUSR. But in general we expect + * GETREGS/PUTREGS to be the API of choice anyway. + */ +static char *getregs(struct task_struct *child, struct pt_regs *uregs) +{ + *uregs = *task_pt_regs(child); + + /* Set up flags ABI bits. */ + uregs->flags = 0; +#ifdef CONFIG_COMPAT + if (task_thread_info(child)->status & TS_COMPAT) + uregs->flags |= PT_FLAGS_COMPAT; +#endif + + return (char *)uregs; +} + +/* Put registers back to task. */ +static void putregs(struct task_struct *child, struct pt_regs *uregs) +{ + struct pt_regs *regs = task_pt_regs(child); + + /* Don't allow overwriting the kernel-internal flags word. */ + uregs->flags = regs->flags; + + /* Only allow setting the ICS bit in the ex1 word. */ + uregs->ex1 = PL_ICS_EX1(USER_PL, EX1_ICS(uregs->ex1)); + + *regs = *uregs; +} + +enum tile_regset { + REGSET_GPR, +}; + +static int tile_gpr_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct pt_regs regs; + + getregs(target, ®s); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, ®s, 0, + sizeof(regs)); +} + +static int tile_gpr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct pt_regs regs; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, ®s, 0, + sizeof(regs)); + if (ret) + return ret; + + putregs(target, ®s); + + return 0; +} + +static const struct user_regset tile_user_regset[] = { + [REGSET_GPR] = { + .core_note_type = NT_PRSTATUS, + .n = ELF_NGREG, + .size = sizeof(elf_greg_t), + .align = sizeof(elf_greg_t), + .get = tile_gpr_get, + .set = tile_gpr_set, + }, +}; + +static const struct user_regset_view tile_user_regset_view = { + .name = CHIP_ARCH_NAME, + .e_machine = ELF_ARCH, + .ei_osabi = ELF_OSABI, + .regsets = tile_user_regset, + .n = ARRAY_SIZE(tile_user_regset), +}; + +const struct user_regset_view *task_user_regset_view(struct task_struct *task) +{ + return &tile_user_regset_view; +} + long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { @@ -53,14 +154,13 @@ long arch_ptrace(struct task_struct *child, long request, long ret = -EIO; char *childreg; struct pt_regs copyregs; - int ex1_offset; switch (request) { case PTRACE_PEEKUSR: /* Read register from pt_regs. */ if (addr >= PTREGS_SIZE) break; - childreg = (char *)task_pt_regs(child) + addr; + childreg = getregs(child, ©regs) + addr; #ifdef CONFIG_COMPAT if (is_compat_task()) { if (addr & (sizeof(compat_long_t)-1)) @@ -79,17 +179,7 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_POKEUSR: /* Write register in pt_regs. */ if (addr >= PTREGS_SIZE) break; - childreg = (char *)task_pt_regs(child) + addr; - - /* Guard against overwrites of the privilege level. */ - ex1_offset = PTREGS_OFFSET_EX1; -#if defined(CONFIG_COMPAT) && defined(__BIG_ENDIAN) - if (is_compat_task()) /* point at low word */ - ex1_offset += sizeof(compat_long_t); -#endif - if (addr == ex1_offset) - data = PL_ICS_EX1(USER_PL, EX1_ICS(data)); - + childreg = getregs(child, ©regs) + addr; #ifdef CONFIG_COMPAT if (is_compat_task()) { if (addr & (sizeof(compat_long_t)-1)) @@ -102,24 +192,20 @@ long arch_ptrace(struct task_struct *child, long request, break; *(long *)childreg = data; } + putregs(child, ©regs); ret = 0; break; case PTRACE_GETREGS: /* Get all registers from the child. */ - if (copy_to_user(datap, task_pt_regs(child), - sizeof(struct pt_regs)) == 0) { - ret = 0; - } + ret = copy_regset_to_user(child, &tile_user_regset_view, + REGSET_GPR, 0, + sizeof(struct pt_regs), datap); break; case PTRACE_SETREGS: /* Set all registers in the child. */ - if (copy_from_user(©regs, datap, - sizeof(struct pt_regs)) == 0) { - copyregs.ex1 = - PL_ICS_EX1(USER_PL, EX1_ICS(copyregs.ex1)); - *task_pt_regs(child) = copyregs; - ret = 0; - } + ret = copy_regset_from_user(child, &tile_user_regset_view, + REGSET_GPR, 0, + sizeof(struct pt_regs), datap); break; case PTRACE_GETFPREGS: /* Get the child FPU state. */ @@ -128,12 +214,16 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_SETOPTIONS: /* Support TILE-specific ptrace options. */ - child->ptrace &= ~PT_TRACE_MASK_TILE; + BUILD_BUG_ON(PTRACE_O_MASK_TILE & PTRACE_O_MASK); tmp = data & PTRACE_O_MASK_TILE; data &= ~PTRACE_O_MASK_TILE; ret = ptrace_request(child, request, addr, data); - if (tmp & PTRACE_O_TRACEMIGRATE) - child->ptrace |= PT_TRACE_MIGRATE; + if (ret == 0) { + unsigned int flags = child->ptrace; + flags &= ~(PTRACE_O_MASK_TILE << PT_OPT_FLAG_SHIFT); + flags |= (tmp << PT_OPT_FLAG_SHIFT); + child->ptrace = flags; + } break; default: @@ -160,32 +250,44 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, } #endif -void do_syscall_trace(void) +int do_syscall_trace_enter(struct pt_regs *regs) { - if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return; + if (test_thread_flag(TIF_SYSCALL_TRACE)) { + if (tracehook_report_syscall_entry(regs)) + regs->regs[TREG_SYSCALL_NR] = -1; + } - if (!(current->ptrace & PT_PTRACED)) - return; + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + trace_sys_enter(regs, regs->regs[TREG_SYSCALL_NR]); - /* - * The 0x80 provides a way for the tracing parent to distinguish - * between a syscall stop and SIGTRAP delivery - */ - ptrace_notify(SIGTRAP|((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0)); + return regs->regs[TREG_SYSCALL_NR]; +} + +void do_syscall_trace_exit(struct pt_regs *regs) +{ + long errno; /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl + * The standard tile calling convention returns the value (or negative + * errno) in r0, and zero (or positive errno) in r1. + * It saves a couple of cycles on the hot path to do this work in + * registers only as we return, rather than updating the in-memory + * struct ptregs. */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } + errno = (long) regs->regs[0]; + if (errno < 0 && errno > -4096) + regs->regs[1] = -errno; + else + regs->regs[1] = 0; + + if (test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, 0); + + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + trace_sys_exit(regs, regs->regs[0]); } -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs) { struct siginfo info; @@ -201,5 +303,5 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) /* Handle synthetic interrupt delivered only by the simulator. */ void __kprobes do_breakpoint(struct pt_regs* regs, int fault_num) { - send_sigtrap(current, regs, fault_num); + send_sigtrap(current, regs); } diff --git a/arch/tile/kernel/reboot.c b/arch/tile/kernel/reboot.c index baa3d905fee..6c5d2c070a1 100644 --- a/arch/tile/kernel/reboot.c +++ b/arch/tile/kernel/reboot.c @@ -16,6 +16,7 @@ #include <linux/reboot.h> #include <linux/smp.h> #include <linux/pm.h> +#include <linux/export.h> #include <asm/page.h> #include <asm/setup.h> #include <hv/hypervisor.h> @@ -26,7 +27,6 @@ void machine_halt(void) { - warn_early_printk(); arch_local_irq_disable_all(); smp_send_stop(); hv_halt(); @@ -34,7 +34,6 @@ void machine_halt(void) void machine_power_off(void) { - warn_early_printk(); arch_local_irq_disable_all(); smp_send_stop(); hv_power_off(); @@ -49,3 +48,4 @@ void machine_restart(char *cmd) /* No interesting distinction to be made here. */ void (*pm_power_off)(void) = NULL; +EXPORT_SYMBOL(pm_power_off); diff --git a/arch/tile/kernel/regs_32.S b/arch/tile/kernel/regs_32.S index caa13101c26..542cae17a93 100644 --- a/arch/tile/kernel/regs_32.S +++ b/arch/tile/kernel/regs_32.S @@ -13,14 +13,14 @@ */ #include <linux/linkage.h> -#include <asm/system.h> #include <asm/ptrace.h> #include <asm/asm-offsets.h> #include <arch/spr_def.h> #include <asm/processor.h> +#include <asm/switch_to.h> /* - * See <asm/system.h>; called with prev and next task_struct pointers. + * See <asm/switch_to.h>; called with prev and next task_struct pointers. * "prev" is returned in r0 for _switch_to and also for ret_from_fork. * * We want to save pc/sp in "prev", and get the new pc/sp from "next". @@ -39,7 +39,7 @@ */ #if CALLEE_SAVED_REGS_COUNT != 24 -# error Mismatch between <asm/system.h> and kernel/entry.S +# error Mismatch between <asm/switch_to.h> and kernel/entry.S #endif #define FRAME_SIZE ((2 + CALLEE_SAVED_REGS_COUNT) * 4) diff --git a/arch/tile/kernel/regs_64.S b/arch/tile/kernel/regs_64.S index f748c1e8528..bbffcc6f340 100644 --- a/arch/tile/kernel/regs_64.S +++ b/arch/tile/kernel/regs_64.S @@ -13,14 +13,14 @@ */ #include <linux/linkage.h> -#include <asm/system.h> #include <asm/ptrace.h> #include <asm/asm-offsets.h> #include <arch/spr_def.h> #include <asm/processor.h> +#include <asm/switch_to.h> /* - * See <asm/system.h>; called with prev and next task_struct pointers. + * See <asm/switch_to.h>; called with prev and next task_struct pointers. * "prev" is returned in r0 for _switch_to and also for ret_from_fork. * * We want to save pc/sp in "prev", and get the new pc/sp from "next". @@ -39,7 +39,7 @@ */ #if CALLEE_SAVED_REGS_COUNT != 24 -# error Mismatch between <asm/system.h> and kernel/entry.S +# error Mismatch between <asm/switch_to.h> and kernel/entry.S #endif #define FRAME_SIZE ((2 + CALLEE_SAVED_REGS_COUNT) * 8) diff --git a/arch/tile/kernel/relocate_kernel.S b/arch/tile/kernel/relocate_kernel_32.S index 010b418515f..e44fbcf8cbd 100644 --- a/arch/tile/kernel/relocate_kernel.S +++ b/arch/tile/kernel/relocate_kernel_32.S @@ -20,15 +20,6 @@ #include <asm/page.h> #include <hv/hypervisor.h> -#define ___hvb MEM_SV_INTRPT + HV_GLUE_START_CPA - -#define ___hv_dispatch(f) (___hvb + (HV_DISPATCH_ENTRY_SIZE * f)) - -#define ___hv_console_putc ___hv_dispatch(HV_DISPATCH_CONSOLE_PUTC) -#define ___hv_halt ___hv_dispatch(HV_DISPATCH_HALT) -#define ___hv_reexec ___hv_dispatch(HV_DISPATCH_REEXEC) -#define ___hv_flush_remote ___hv_dispatch(HV_DISPATCH_FLUSH_REMOTE) - #undef RELOCATE_NEW_KERNEL_VERBOSE STD_ENTRY(relocate_new_kernel) @@ -43,8 +34,8 @@ STD_ENTRY(relocate_new_kernel) addi sp, sp, -8 /* we now have a stack (whether we need one or not) */ - moveli r40, lo16(___hv_console_putc) - auli r40, r40, ha16(___hv_console_putc) + moveli r40, lo16(hv_console_putc) + auli r40, r40, ha16(hv_console_putc) #ifdef RELOCATE_NEW_KERNEL_VERBOSE moveli r0, 'r' @@ -86,7 +77,6 @@ STD_ENTRY(relocate_new_kernel) move r30, sp addi sp, sp, -8 -#if CHIP_HAS_CBOX_HOME_MAP() /* * On TILEPro, we need to flush all tiles' caches, since we may * have been doing hash-for-home caching there. Note that we @@ -114,15 +104,14 @@ STD_ENTRY(relocate_new_kernel) } { move r8, zero /* asids */ - moveli r20, lo16(___hv_flush_remote) + moveli r20, lo16(hv_flush_remote) } { move r9, zero /* asidcount */ - auli r20, r20, ha16(___hv_flush_remote) + auli r20, r20, ha16(hv_flush_remote) } jalr r20 -#endif /* r33 is destination pointer, default to zero */ @@ -175,8 +164,8 @@ STD_ENTRY(relocate_new_kernel) move r0, r32 moveli r1, 0 /* arg to hv_reexec is 64 bits */ - moveli r41, lo16(___hv_reexec) - auli r41, r41, ha16(___hv_reexec) + moveli r41, lo16(hv_reexec) + auli r41, r41, ha16(hv_reexec) jalr r41 @@ -267,8 +256,8 @@ STD_ENTRY(relocate_new_kernel) moveli r0, '\n' jalr r40 .Lhalt: - moveli r41, lo16(___hv_halt) - auli r41, r41, ha16(___hv_halt) + moveli r41, lo16(hv_halt) + auli r41, r41, ha16(hv_halt) jalr r41 STD_ENDPROC(relocate_new_kernel) diff --git a/arch/tile/kernel/relocate_kernel_64.S b/arch/tile/kernel/relocate_kernel_64.S new file mode 100644 index 00000000000..d9d8cf6176e --- /dev/null +++ b/arch/tile/kernel/relocate_kernel_64.S @@ -0,0 +1,263 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + * + * copy new kernel into place and then call hv_reexec + * + */ + +#include <linux/linkage.h> +#include <arch/chip.h> +#include <asm/page.h> +#include <hv/hypervisor.h> + +#undef RELOCATE_NEW_KERNEL_VERBOSE + +STD_ENTRY(relocate_new_kernel) + + move r30, r0 /* page list */ + move r31, r1 /* address of page we are on */ + move r32, r2 /* start address of new kernel */ + + shrui r1, r1, PAGE_SHIFT + addi r1, r1, 1 + shli sp, r1, PAGE_SHIFT + addi sp, sp, -8 + /* we now have a stack (whether we need one or not) */ + +#ifdef RELOCATE_NEW_KERNEL_VERBOSE + moveli r40, hw2_last(hv_console_putc) + shl16insli r40, r40, hw1(hv_console_putc) + shl16insli r40, r40, hw0(hv_console_putc) + + moveli r0, 'r' + jalr r40 + + moveli r0, '_' + jalr r40 + + moveli r0, 'n' + jalr r40 + + moveli r0, '_' + jalr r40 + + moveli r0, 'k' + jalr r40 + + moveli r0, '\n' + jalr r40 +#endif + + /* + * Throughout this code r30 is pointer to the element of page + * list we are working on. + * + * Normally we get to the next element of the page list by + * incrementing r30 by eight. The exception is if the element + * on the page list is an IND_INDIRECTION in which case we use + * the element with the low bits masked off as the new value + * of r30. + * + * To get this started, we need the value passed to us (which + * will always be an IND_INDIRECTION) in memory somewhere with + * r30 pointing at it. To do that, we push the value passed + * to us on the stack and make r30 point to it. + */ + + st sp, r30 + move r30, sp + addi sp, sp, -16 + + /* + * On TILE-GX, we need to flush all tiles' caches, since we may + * have been doing hash-for-home caching there. Note that we + * must do this _after_ we're completely done modifying any memory + * other than our output buffer (which we know is locally cached). + * We want the caches to be fully clean when we do the reexec, + * because the hypervisor is going to do this flush again at that + * point, and we don't want that second flush to overwrite any memory. + */ + { + move r0, zero /* cache_pa */ + moveli r1, hw2_last(HV_FLUSH_EVICT_L2) + } + { + shl16insli r1, r1, hw1(HV_FLUSH_EVICT_L2) + movei r2, -1 /* cache_cpumask; -1 means all client tiles */ + } + { + shl16insli r1, r1, hw0(HV_FLUSH_EVICT_L2) /* cache_control */ + move r3, zero /* tlb_va */ + } + { + move r4, zero /* tlb_length */ + move r5, zero /* tlb_pgsize */ + } + { + move r6, zero /* tlb_cpumask */ + move r7, zero /* asids */ + } + { + moveli r20, hw2_last(hv_flush_remote) + move r8, zero /* asidcount */ + } + shl16insli r20, r20, hw1(hv_flush_remote) + shl16insli r20, r20, hw0(hv_flush_remote) + + jalr r20 + + /* r33 is destination pointer, default to zero */ + + moveli r33, 0 + +.Lloop: ld r10, r30 + + andi r9, r10, 0xf /* low 4 bits tell us what type it is */ + xor r10, r10, r9 /* r10 is now value with low 4 bits stripped */ + + cmpeqi r0, r9, 0x1 /* IND_DESTINATION */ + beqzt r0, .Ltry2 + + move r33, r10 + +#ifdef RELOCATE_NEW_KERNEL_VERBOSE + moveli r0, 'd' + jalr r40 +#endif + + addi r30, r30, 8 + j .Lloop + +.Ltry2: + cmpeqi r0, r9, 0x2 /* IND_INDIRECTION */ + beqzt r0, .Ltry4 + + move r30, r10 + +#ifdef RELOCATE_NEW_KERNEL_VERBOSE + moveli r0, 'i' + jalr r40 +#endif + + j .Lloop + +.Ltry4: + cmpeqi r0, r9, 0x4 /* IND_DONE */ + beqzt r0, .Ltry8 + + mf + +#ifdef RELOCATE_NEW_KERNEL_VERBOSE + moveli r0, 'D' + jalr r40 + moveli r0, '\n' + jalr r40 +#endif + + move r0, r32 + + moveli r41, hw2_last(hv_reexec) + shl16insli r41, r41, hw1(hv_reexec) + shl16insli r41, r41, hw0(hv_reexec) + + jalr r41 + + /* we should not get here */ + +#ifdef RELOCATE_NEW_KERNEL_VERBOSE + moveli r0, '?' + jalr r40 + moveli r0, '\n' + jalr r40 +#endif + + j .Lhalt + +.Ltry8: cmpeqi r0, r9, 0x8 /* IND_SOURCE */ + beqz r0, .Lerr /* unknown type */ + + /* copy page at r10 to page at r33 */ + + move r11, r33 + + moveli r0, hw2_last(PAGE_SIZE) + shl16insli r0, r0, hw1(PAGE_SIZE) + shl16insli r0, r0, hw0(PAGE_SIZE) + add r33, r33, r0 + + /* copy word at r10 to word at r11 until r11 equals r33 */ + + /* We know page size must be multiple of 8, so we can unroll + * 8 times safely without any edge case checking. + * + * Issue a flush of the destination every 8 words to avoid + * incoherence when starting the new kernel. (Now this is + * just good paranoia because the hv_reexec call will also + * take care of this.) + */ + +1: + { ld r0, r10; addi r10, r10, 8 } + { st r11, r0; addi r11, r11, 8 } + { ld r0, r10; addi r10, r10, 8 } + { st r11, r0; addi r11, r11, 8 } + { ld r0, r10; addi r10, r10, 8 } + { st r11, r0; addi r11, r11, 8 } + { ld r0, r10; addi r10, r10, 8 } + { st r11, r0; addi r11, r11, 8 } + { ld r0, r10; addi r10, r10, 8 } + { st r11, r0; addi r11, r11, 8 } + { ld r0, r10; addi r10, r10, 8 } + { st r11, r0; addi r11, r11, 8 } + { ld r0, r10; addi r10, r10, 8 } + { st r11, r0; addi r11, r11, 8 } + { ld r0, r10; addi r10, r10, 8 } + { st r11, r0 } + { flush r11 ; addi r11, r11, 8 } + + cmpeq r0, r33, r11 + beqzt r0, 1b + +#ifdef RELOCATE_NEW_KERNEL_VERBOSE + moveli r0, 's' + jalr r40 +#endif + + addi r30, r30, 8 + j .Lloop + + +.Lerr: +#ifdef RELOCATE_NEW_KERNEL_VERBOSE + moveli r0, 'e' + jalr r40 + moveli r0, 'r' + jalr r40 + moveli r0, 'r' + jalr r40 + moveli r0, '\n' + jalr r40 +#endif +.Lhalt: + moveli r41, hw2_last(hv_halt) + shl16insli r41, r41, hw1(hv_halt) + shl16insli r41, r41, hw0(hv_halt) + + jalr r41 + STD_ENDPROC(relocate_new_kernel) + + .section .rodata,"a" + + .globl relocate_new_kernel_size +relocate_new_kernel_size: + .long .Lend_relocate_new_kernel - relocate_new_kernel diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index 5f85d8b34db..112ababa9e5 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -23,11 +23,15 @@ #include <linux/irq.h> #include <linux/kexec.h> #include <linux/pci.h> +#include <linux/swiotlb.h> #include <linux/initrd.h> #include <linux/io.h> #include <linux/highmem.h> #include <linux/smp.h> #include <linux/timex.h> +#include <linux/hugetlb.h> +#include <linux/start_kernel.h> +#include <linux/screen_info.h> #include <asm/setup.h> #include <asm/sections.h> #include <asm/cacheflush.h> @@ -46,24 +50,41 @@ static inline int ABS(int x) { return x >= 0 ? x : -x; } /* Chip information */ char chip_model[64] __write_once; +#ifdef CONFIG_VT +struct screen_info screen_info; +#endif + struct pglist_data node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -/* We only create bootmem data on node 0. */ -static bootmem_data_t __initdata node0_bdata; - /* Information on the NUMA nodes that we compute early */ -unsigned long __cpuinitdata node_start_pfn[MAX_NUMNODES]; -unsigned long __cpuinitdata node_end_pfn[MAX_NUMNODES]; +unsigned long node_start_pfn[MAX_NUMNODES]; +unsigned long node_end_pfn[MAX_NUMNODES]; unsigned long __initdata node_memmap_pfn[MAX_NUMNODES]; unsigned long __initdata node_percpu_pfn[MAX_NUMNODES]; unsigned long __initdata node_free_pfn[MAX_NUMNODES]; static unsigned long __initdata node_percpu[MAX_NUMNODES]; +/* + * per-CPU stack and boot info. + */ +DEFINE_PER_CPU(unsigned long, boot_sp) = + (unsigned long)init_stack + THREAD_SIZE; + +#ifdef CONFIG_SMP +DEFINE_PER_CPU(unsigned long, boot_pc) = (unsigned long)start_kernel; +#else +/* + * The variable must be __initdata since it references __init code. + * With CONFIG_SMP it is per-cpu data, which is exempt from validation. + */ +unsigned long __initdata boot_pc = (unsigned long)start_kernel; +#endif + #ifdef CONFIG_HIGHMEM /* Page frame index of end of lowmem on each controller. */ -unsigned long __cpuinitdata node_lowmem_end_pfn[MAX_NUMNODES]; +unsigned long node_lowmem_end_pfn[MAX_NUMNODES]; /* Number of pages that can be mapped into lowmem. */ static unsigned long __initdata mappable_physpages; @@ -94,7 +115,7 @@ static unsigned int __initdata maxnodemem_pfn[MAX_NUMNODES] = { }; static nodemask_t __initdata isolnodes; -#ifdef CONFIG_PCI +#if defined(CONFIG_PCI) && !defined(__tilegx__) enum { DEFAULT_PCI_RESERVE_MB = 64 }; static unsigned int __initdata pci_reserve_mb = DEFAULT_PCI_RESERVE_MB; unsigned long __initdata pci_reserve_start_pfn = -1U; @@ -103,13 +124,11 @@ unsigned long __initdata pci_reserve_end_pfn = -1U; static int __init setup_maxmem(char *str) { - long maxmem_mb; - if (str == NULL || strict_strtol(str, 0, &maxmem_mb) != 0 || - maxmem_mb == 0) + unsigned long long maxmem; + if (str == NULL || (maxmem = memparse(str, NULL)) == 0) return -EINVAL; - maxmem_pfn = (maxmem_mb >> (HPAGE_SHIFT - 20)) << - (HPAGE_SHIFT - PAGE_SHIFT); + maxmem_pfn = (maxmem >> HPAGE_SHIFT) << (HPAGE_SHIFT - PAGE_SHIFT); pr_info("Forcing RAM used to no more than %dMB\n", maxmem_pfn >> (20 - PAGE_SHIFT)); return 0; @@ -119,14 +138,15 @@ early_param("maxmem", setup_maxmem); static int __init setup_maxnodemem(char *str) { char *endp; - long maxnodemem_mb, node; + unsigned long long maxnodemem; + long node; node = str ? simple_strtoul(str, &endp, 0) : INT_MAX; - if (node >= MAX_NUMNODES || *endp != ':' || - strict_strtol(endp+1, 0, &maxnodemem_mb) != 0) + if (node >= MAX_NUMNODES || *endp != ':') return -EINVAL; - maxnodemem_pfn[node] = (maxnodemem_mb >> (HPAGE_SHIFT - 20)) << + maxnodemem = memparse(endp+1, NULL); + maxnodemem_pfn[node] = (maxnodemem >> HPAGE_SHIFT) << (HPAGE_SHIFT - PAGE_SHIFT); pr_info("Forcing RAM used on node %ld to no more than %dMB\n", node, maxnodemem_pfn[node] >> (20 - PAGE_SHIFT)); @@ -134,6 +154,65 @@ static int __init setup_maxnodemem(char *str) } early_param("maxnodemem", setup_maxnodemem); +struct memmap_entry { + u64 addr; /* start of memory segment */ + u64 size; /* size of memory segment */ +}; +static struct memmap_entry memmap_map[64]; +static int memmap_nr; + +static void add_memmap_region(u64 addr, u64 size) +{ + if (memmap_nr >= ARRAY_SIZE(memmap_map)) { + pr_err("Ooops! Too many entries in the memory map!\n"); + return; + } + memmap_map[memmap_nr].addr = addr; + memmap_map[memmap_nr].size = size; + memmap_nr++; +} + +static int __init setup_memmap(char *p) +{ + char *oldp; + u64 start_at, mem_size; + + if (!p) + return -EINVAL; + + if (!strncmp(p, "exactmap", 8)) { + pr_err("\"memmap=exactmap\" not valid on tile\n"); + return 0; + } + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + + if (*p == '@') { + pr_err("\"memmap=nn@ss\" (force RAM) invalid on tile\n"); + } else if (*p == '#') { + pr_err("\"memmap=nn#ss\" (force ACPI data) invalid on tile\n"); + } else if (*p == '$') { + start_at = memparse(p+1, &p); + add_memmap_region(start_at, mem_size); + } else { + if (mem_size == 0) + return -EINVAL; + maxmem_pfn = (mem_size >> HPAGE_SHIFT) << + (HPAGE_SHIFT - PAGE_SHIFT); + } + return *p == '\0' ? 0 : -EINVAL; +} +early_param("memmap", setup_memmap); + +static int __init setup_mem(char *str) +{ + return setup_maxmem(str); +} +early_param("mem", setup_mem); /* compatibility with x86 */ + static int __init setup_isolnodes(char *str) { char buf[MAX_NUMNODES * 5]; @@ -146,18 +225,15 @@ static int __init setup_isolnodes(char *str) } early_param("isolnodes", setup_isolnodes); -#ifdef CONFIG_PCI +#if defined(CONFIG_PCI) && !defined(__tilegx__) static int __init setup_pci_reserve(char* str) { - unsigned long mb; - - if (str == NULL || strict_strtoul(str, 0, &mb) != 0 || - mb > 3 * 1024) + if (str == NULL || kstrtouint(str, 0, &pci_reserve_mb) != 0 || + pci_reserve_mb > 3 * 1024) return -EINVAL; - pci_reserve_mb = mb; pr_info("Reserving %dMB for PCIE root complex mappings\n", - pci_reserve_mb); + pci_reserve_mb); return 0; } early_param("pci_reserve", setup_pci_reserve); @@ -189,7 +265,7 @@ early_param("vmalloc", parse_vmalloc); /* * Determine for each controller where its lowmem is mapped and how much of * it is mapped there. On controller zero, the first few megabytes are - * already mapped in as code at MEM_SV_INTRPT, so in principle we could + * already mapped in as code at MEM_SV_START, so in principle we could * start our data mappings higher up, but for now we don't bother, to avoid * additional confusion. * @@ -270,7 +346,7 @@ static void *__init setup_pa_va_mapping(void) * This is up to 4 mappings for lowmem, one mapping per memory * controller, plus one for our text segment. */ -static void __cpuinit store_permanent_mappings(void) +static void store_permanent_mappings(void) { int i; @@ -287,8 +363,8 @@ static void __cpuinit store_permanent_mappings(void) hv_store_mapping(addr, pages << PAGE_SHIFT, pa); } - hv_store_mapping((HV_VirtAddr)_stext, - (uint32_t)(_einittext - _stext), 0); + hv_store_mapping((HV_VirtAddr)_text, + (uint32_t)(_einittext - _text), 0); } /* @@ -309,6 +385,7 @@ static void __init setup_memory(void) #if defined(CONFIG_HIGHMEM) || defined(__tilegx__) long lowmem_pages; #endif + unsigned long physpages = 0; /* We are using a char to hold the cpu_2_node[] mapping */ BUILD_BUG_ON(MAX_NUMNODES > 127); @@ -368,8 +445,8 @@ static void __init setup_memory(void) continue; } } - if (num_physpages + PFN_DOWN(range.size) > maxmem_pfn) { - int max_size = maxmem_pfn - num_physpages; + if (physpages + PFN_DOWN(range.size) > maxmem_pfn) { + int max_size = maxmem_pfn - physpages; if (max_size > 0) { pr_err("Maxmem reduced node %d to %d pages\n", i, max_size); @@ -397,7 +474,7 @@ static void __init setup_memory(void) continue; } #endif -#ifdef CONFIG_PCI +#if defined(CONFIG_PCI) && !defined(__tilegx__) /* * Blocks that overlap the pci reserved region must * have enough space to hold the maximum percpu data @@ -426,7 +503,7 @@ static void __init setup_memory(void) node_start_pfn[i] = start; node_end_pfn[i] = end; node_controller[i] = range.controller; - num_physpages += size; + physpages += size; max_pfn = end; /* Mark node as online */ @@ -445,7 +522,7 @@ static void __init setup_memory(void) * we're willing to use at 8 million pages (32GB of 4KB pages). */ cap = 8 * 1024 * 1024; /* 8 million pages */ - if (num_physpages > cap) { + if (physpages > cap) { int num_nodes = num_online_nodes(); int cap_each = cap / num_nodes; unsigned long dropped_pages = 0; @@ -456,10 +533,10 @@ static void __init setup_memory(void) node_end_pfn[i] = node_start_pfn[i] + cap_each; } } - num_physpages -= dropped_pages; + physpages -= dropped_pages; pr_warning("Only using %ldMB memory;" " ignoring %ldMB.\n", - num_physpages >> (20 - PAGE_SHIFT), + physpages >> (20 - PAGE_SHIFT), dropped_pages >> (20 - PAGE_SHIFT)); pr_warning("Consider using a larger page size.\n"); } @@ -477,7 +554,7 @@ static void __init setup_memory(void) lowmem_pages = (mappable_physpages > MAXMEM_PFN) ? MAXMEM_PFN : mappable_physpages; - highmem_pages = (long) (num_physpages - lowmem_pages); + highmem_pages = (long) (physpages - lowmem_pages); pr_notice("%ldMB HIGHMEM available.\n", pages_to_mb(highmem_pages > 0 ? highmem_pages : 0)); @@ -494,7 +571,6 @@ static void __init setup_memory(void) pr_warning("Use a HIGHMEM enabled kernel.\n"); max_low_pfn = MAXMEM_PFN; max_pfn = MAXMEM_PFN; - num_physpages = MAXMEM_PFN; node_end_pfn[0] = MAXMEM_PFN; } else { pr_notice("%ldMB memory available.\n", @@ -519,41 +595,125 @@ static void __init setup_memory(void) #endif } -static void __init setup_bootmem_allocator(void) +/* + * On 32-bit machines, we only put bootmem on the low controller, + * since PAs > 4GB can't be used in bootmem. In principle one could + * imagine, e.g., multiple 1 GB controllers all of which could support + * bootmem, but in practice using controllers this small isn't a + * particularly interesting scenario, so we just keep it simple and + * use only the first controller for bootmem on 32-bit machines. + */ +static inline int node_has_bootmem(int nid) +{ +#ifdef CONFIG_64BIT + return 1; +#else + return nid == 0; +#endif +} + +static inline unsigned long alloc_bootmem_pfn(int nid, + unsigned long size, + unsigned long goal) +{ + void *kva = __alloc_bootmem_node(NODE_DATA(nid), size, + PAGE_SIZE, goal); + unsigned long pfn = kaddr_to_pfn(kva); + BUG_ON(goal && PFN_PHYS(pfn) != goal); + return pfn; +} + +static void __init setup_bootmem_allocator_node(int i) { - unsigned long bootmap_size, first_alloc_pfn, last_alloc_pfn; + unsigned long start, end, mapsize, mapstart; + + if (node_has_bootmem(i)) { + NODE_DATA(i)->bdata = &bootmem_node_data[i]; + } else { + /* Share controller zero's bdata for now. */ + NODE_DATA(i)->bdata = &bootmem_node_data[0]; + return; + } - /* Provide a node 0 bdata. */ - NODE_DATA(0)->bdata = &node0_bdata; + /* Skip up to after the bss in node 0. */ + start = (i == 0) ? min_low_pfn : node_start_pfn[i]; -#ifdef CONFIG_PCI - /* Don't let boot memory alias the PCI region. */ - last_alloc_pfn = min(max_low_pfn, pci_reserve_start_pfn); + /* Only lowmem, if we're a HIGHMEM build. */ +#ifdef CONFIG_HIGHMEM + end = node_lowmem_end_pfn[i]; #else - last_alloc_pfn = max_low_pfn; + end = node_end_pfn[i]; #endif - /* - * Initialize the boot-time allocator (with low memory only): - * The first argument says where to put the bitmap, and the - * second says where the end of allocatable memory is. - */ - bootmap_size = init_bootmem(min_low_pfn, last_alloc_pfn); + /* No memory here. */ + if (end == start) + return; + + /* Figure out where the bootmem bitmap is located. */ + mapsize = bootmem_bootmap_pages(end - start); + if (i == 0) { + /* Use some space right before the heap on node 0. */ + mapstart = start; + start += mapsize; + } else { + /* Allocate bitmap on node 0 to avoid page table issues. */ + mapstart = alloc_bootmem_pfn(0, PFN_PHYS(mapsize), 0); + } + + /* Initialize a node. */ + init_bootmem_node(NODE_DATA(i), mapstart, start, end); + /* Free all the space back into the allocator. */ + free_bootmem(PFN_PHYS(start), PFN_PHYS(end - start)); + +#if defined(CONFIG_PCI) && !defined(__tilegx__) /* - * Let the bootmem allocator use all the space we've given it - * except for its own bitmap. + * Throw away any memory aliased by the PCI region. */ - first_alloc_pfn = min_low_pfn + PFN_UP(bootmap_size); - if (first_alloc_pfn >= last_alloc_pfn) - early_panic("Not enough memory on controller 0 for bootmem\n"); + if (pci_reserve_start_pfn < end && pci_reserve_end_pfn > start) { + start = max(pci_reserve_start_pfn, start); + end = min(pci_reserve_end_pfn, end); + reserve_bootmem(PFN_PHYS(start), PFN_PHYS(end - start), + BOOTMEM_EXCLUSIVE); + } +#endif +} + +static void __init setup_bootmem_allocator(void) +{ + int i; + for (i = 0; i < MAX_NUMNODES; ++i) + setup_bootmem_allocator_node(i); + + /* Reserve any memory excluded by "memmap" arguments. */ + for (i = 0; i < memmap_nr; ++i) { + struct memmap_entry *m = &memmap_map[i]; + reserve_bootmem(m->addr, m->size, BOOTMEM_DEFAULT); + } - free_bootmem(PFN_PHYS(first_alloc_pfn), - PFN_PHYS(last_alloc_pfn - first_alloc_pfn)); +#ifdef CONFIG_BLK_DEV_INITRD + if (initrd_start) { + /* Make sure the initrd memory region is not modified. */ + if (reserve_bootmem(initrd_start, initrd_end - initrd_start, + BOOTMEM_EXCLUSIVE)) { + pr_crit("The initrd memory region has been polluted. Disabling it.\n"); + initrd_start = 0; + initrd_end = 0; + } else { + /* + * Translate initrd_start & initrd_end from PA to VA for + * future access. + */ + initrd_start += PAGE_OFFSET; + initrd_end += PAGE_OFFSET; + } + } +#endif #ifdef CONFIG_KEXEC if (crashk_res.start != crashk_res.end) - reserve_bootmem(crashk_res.start, resource_size(&crashk_res), 0); + reserve_bootmem(crashk_res.start, resource_size(&crashk_res), + BOOTMEM_DEFAULT); #endif } @@ -580,19 +740,13 @@ static int __init percpu_size(void) return size; } -static inline unsigned long alloc_bootmem_pfn(int size, unsigned long goal) -{ - void *kva = __alloc_bootmem(size, PAGE_SIZE, goal); - unsigned long pfn = kaddr_to_pfn(kva); - BUG_ON(goal && PFN_PHYS(pfn) != goal); - return pfn; -} - static void __init zone_sizes_init(void) { unsigned long zones_size[MAX_NR_ZONES] = { 0 }; int size = percpu_size(); int num_cpus = smp_height * smp_width; + const unsigned long dma_end = (1UL << (32 - PAGE_SHIFT)); + int i; for (i = 0; i < num_cpus; ++i) @@ -625,21 +779,22 @@ static void __init zone_sizes_init(void) * though, there'll be no lowmem, so we just alloc_bootmem * the memmap. There will be no percpu memory either. */ - if (__pfn_to_highbits(start) == 0) { - /* In low PAs, allocate via bootmem. */ + if (i != 0 && cpu_isset(i, isolnodes)) { + node_memmap_pfn[i] = + alloc_bootmem_pfn(0, memmap_size, 0); + BUG_ON(node_percpu[i] != 0); + } else if (node_has_bootmem(start)) { unsigned long goal = 0; node_memmap_pfn[i] = - alloc_bootmem_pfn(memmap_size, goal); + alloc_bootmem_pfn(i, memmap_size, 0); if (kdata_huge) goal = PFN_PHYS(lowmem_end) - node_percpu[i]; if (node_percpu[i]) node_percpu_pfn[i] = - alloc_bootmem_pfn(node_percpu[i], goal); - } else if (cpu_isset(i, isolnodes)) { - node_memmap_pfn[i] = alloc_bootmem_pfn(memmap_size, 0); - BUG_ON(node_percpu[i] != 0); + alloc_bootmem_pfn(i, node_percpu[i], + goal); } else { - /* In high PAs, just reserve some pages. */ + /* In non-bootmem zones, just reserve some pages. */ node_memmap_pfn[i] = node_free_pfn[i]; node_free_pfn[i] += PFN_UP(memmap_size); if (!kdata_huge) { @@ -663,23 +818,24 @@ static void __init zone_sizes_init(void) zones_size[ZONE_NORMAL] = end - start; #endif - /* - * Everyone shares node 0's bootmem allocator, but - * we use alloc_remap(), above, to put the actual - * struct page array on the individual controllers, - * which is most of the data that we actually care about. - * We can't place bootmem allocators on the other - * controllers since the bootmem allocator can only - * operate on 32-bit physical addresses. - */ - NODE_DATA(i)->bdata = NODE_DATA(0)->bdata; + if (start < dma_end) { + zones_size[ZONE_DMA] = min(zones_size[ZONE_NORMAL], + dma_end - start); + zones_size[ZONE_NORMAL] -= zones_size[ZONE_DMA]; + } else { + zones_size[ZONE_DMA] = 0; + } + + /* Take zone metadata from controller 0 if we're isolnode. */ + if (node_isset(i, isolnodes)) + NODE_DATA(i)->bdata = &bootmem_node_data[0]; free_area_init_node(i, zones_size, start, NULL); printk(KERN_DEBUG " Normal zone: %ld per-cpu pages\n", PFN_UP(node_percpu[i])); /* Track the type of memory on each node */ - if (zones_size[ZONE_NORMAL]) + if (zones_size[ZONE_NORMAL] || zones_size[ZONE_DMA]) node_set_state(i, N_NORMAL_MEMORY); #ifdef CONFIG_HIGHMEM if (end != start) @@ -855,13 +1011,29 @@ subsys_initcall(topology_init); #endif /* CONFIG_NUMA */ +/* + * Initialize hugepage support on this cpu. We do this on all cores + * early in boot: before argument parsing for the boot cpu, and after + * argument parsing but before the init functions run on the secondaries. + * So the values we set up here in the hypervisor may be overridden on + * the boot cpu as arguments are parsed. + */ +static void init_super_pages(void) +{ +#ifdef CONFIG_HUGETLB_SUPER_PAGES + int i; + for (i = 0; i < HUGE_SHIFT_ENTRIES; ++i) + hv_set_pte_super_shift(i, huge_shift[i]); +#endif +} + /** * setup_cpu() - Do all necessary per-cpu, tile-specific initialization. * @boot: Is this the boot cpu? * * Called from setup_arch() on the boot cpu, or online_secondary(). */ -void __cpuinit setup_cpu(int boot) +void setup_cpu(int boot) { /* The boot cpu sets up its permanent mappings much earlier. */ if (!boot) @@ -872,9 +1044,6 @@ void __cpuinit setup_cpu(int boot) arch_local_irq_unmask(INT_DMATLB_MISS); arch_local_irq_unmask(INT_DMATLB_ACCESS); #endif -#if CHIP_HAS_SN_PROC() - arch_local_irq_unmask(INT_SNITLB_MISS); -#endif #ifdef __tilegx__ arch_local_irq_unmask(INT_SINGLE_STEP_K); #endif @@ -889,10 +1058,6 @@ void __cpuinit setup_cpu(int boot) /* Static network is not restricted. */ __insn_mtspr(SPR_MPL_SN_ACCESS_SET_0, 1); #endif -#if CHIP_HAS_SN_PROC() - __insn_mtspr(SPR_MPL_SN_NOTIFY_SET_0, 1); - __insn_mtspr(SPR_MPL_SN_CPL_SET_0, 1); -#endif /* * Set the MPL for interrupt control 0 & 1 to the corresponding @@ -909,12 +1074,14 @@ void __cpuinit setup_cpu(int boot) /* Reset the network state on this cpu. */ reset_network_state(); #endif + + init_super_pages(); } #ifdef CONFIG_BLK_DEV_INITRD static int __initdata set_initramfs_file; -static char __initdata initramfs_file[128] = "initramfs.cpio.gz"; +static char __initdata initramfs_file[128] = "initramfs"; static int __init setup_initramfs_file(char *str) { @@ -928,9 +1095,9 @@ static int __init setup_initramfs_file(char *str) early_param("initramfs_file", setup_initramfs_file); /* - * We look for an additional "initramfs.cpio.gz" file in the hvfs. - * If there is one, we allocate some memory for it and it will be - * unpacked to the initramfs after any built-in initramfs_data. + * We look for a file called "initramfs" in the hvfs. If there is one, we + * allocate some memory for it and it will be unpacked to the initramfs. + * If it's compressed, the initd code will uncompress it first. */ static void __init load_hv_initrd(void) { @@ -938,12 +1105,22 @@ static void __init load_hv_initrd(void) int fd, rc; void *initrd; + /* If initrd has already been set, skip initramfs file in hvfs. */ + if (initrd_start) + return; + fd = hv_fs_findfile((HV_VirtAddr) initramfs_file); if (fd == HV_ENOENT) { - if (set_initramfs_file) + if (set_initramfs_file) { pr_warning("No such hvfs initramfs file '%s'\n", initramfs_file); - return; + return; + } else { + /* Try old backwards-compatible name. */ + fd = hv_fs_findfile((HV_VirtAddr)"initramfs.cpio.gz"); + if (fd == HV_ENOENT) + return; + } } BUG_ON(fd < 0); stat = hv_fs_fstat(fd); @@ -970,6 +1147,25 @@ void __init free_initrd_mem(unsigned long begin, unsigned long end) free_bootmem(__pa(begin), end - begin); } +static int __init setup_initrd(char *str) +{ + char *endp; + unsigned long initrd_size; + + initrd_size = str ? simple_strtoul(str, &endp, 0) : 0; + if (initrd_size == 0 || *endp != '@') + return -EINVAL; + + initrd_start = simple_strtoul(endp+1, &endp, 0); + if (initrd_start == 0) + return -EINVAL; + + initrd_end = initrd_start + initrd_size; + + return 0; +} +early_param("initrd", setup_initrd); + #else static inline void load_hv_initrd(void) {} #endif /* CONFIG_BLK_DEV_INITRD */ @@ -1037,7 +1233,7 @@ static void __init validate_va(void) #ifndef __tilegx__ /* FIXME: GX: probably some validation relevant here */ /* * Similarly, make sure we're only using allowed VAs. - * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT, + * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_START, * and 0 .. KERNEL_HIGH_VADDR. * In addition, make sure we CAN'T use the end of memory, since * we use the last chunk of each pgd for the pgd_list. @@ -1052,7 +1248,7 @@ static void __init validate_va(void) if (range.size == 0) break; if (range.start <= MEM_USER_INTRPT && - range.start + range.size >= MEM_HV_INTRPT) + range.start + range.size >= MEM_HV_START) user_kernel_ok = 1; if (range.start == 0) max_va = range.size; @@ -1070,8 +1266,7 @@ static void __init validate_va(void) if ((long)VMALLOC_START >= 0) early_panic( "Linux VMALLOC region below the 2GB line (%#lx)!\n" - "Reconfigure the kernel with fewer NR_HUGE_VMAPS\n" - "or smaller VMALLOC_RESERVE.\n", + "Reconfigure the kernel with smaller VMALLOC_RESERVE.\n", VMALLOC_START); #endif } @@ -1086,7 +1281,6 @@ static void __init validate_va(void) struct cpumask __write_once cpu_lotar_map; EXPORT_SYMBOL(cpu_lotar_map); -#if CHIP_HAS_CBOX_HOME_MAP() /* * hash_for_home_map lists all the tiles that hash-for-home data * will be cached on. Note that this may includes tiles that are not @@ -1096,11 +1290,10 @@ EXPORT_SYMBOL(cpu_lotar_map); */ struct cpumask hash_for_home_map; EXPORT_SYMBOL(hash_for_home_map); -#endif /* * cpu_cacheable_map lists all the cpus whose caches the hypervisor can - * flush on our behalf. It is set to cpu_possible_map OR'ed with + * flush on our behalf. It is set to cpu_possible_mask OR'ed with * hash_for_home_map, and it is what should be passed to * hv_flush_remote() to flush all caches. Note that if there are * dedicated hypervisor driver tiles that have authorized use of their @@ -1186,20 +1379,16 @@ static void __init setup_cpu_maps(void) sizeof(cpu_lotar_map)); if (rc < 0) { pr_err("warning: no HV_INQ_TILES_LOTAR; using AVAIL\n"); - cpu_lotar_map = cpu_possible_map; + cpu_lotar_map = *cpu_possible_mask; } -#if CHIP_HAS_CBOX_HOME_MAP() /* Retrieve set of CPUs used for hash-for-home caching */ rc = hv_inquire_tiles(HV_INQ_TILES_HFH_CACHE, (HV_VirtAddr) hash_for_home_map.bits, sizeof(hash_for_home_map)); if (rc < 0) early_panic("hv_inquire_tiles(HFH_CACHE) failed: rc %d\n", rc); - cpumask_or(&cpu_cacheable_map, &cpu_possible_map, &hash_for_home_map); -#else - cpu_cacheable_map = cpu_possible_map; -#endif + cpumask_or(&cpu_cacheable_map, cpu_possible_mask, &hash_for_home_map); } @@ -1259,7 +1448,7 @@ void __init setup_arch(char **cmdline_p) setup_cpu_maps(); -#ifdef CONFIG_PCI +#if defined(CONFIG_PCI) && !defined(__tilegx__) /* * Initialize the PCI structures. This is done before memory * setup so that we know whether or not a pci_reserve region @@ -1288,6 +1477,10 @@ void __init setup_arch(char **cmdline_p) * any memory using the bootmem allocator. */ +#ifdef CONFIG_SWIOTLB + swiotlb_init(0); +#endif + paging_init(); setup_numa_mapping(); zone_sizes_init(); @@ -1390,26 +1583,26 @@ void __init setup_per_cpu_areas(void) for (i = 0; i < size; i += PAGE_SIZE, ++pfn, ++pg) { /* Update the vmalloc mapping and page home. */ - pte_t *ptep = - virt_to_pte(NULL, (unsigned long)ptr + i); + unsigned long addr = (unsigned long)ptr + i; + pte_t *ptep = virt_to_kpte(addr); pte_t pte = *ptep; BUG_ON(pfn != pte_pfn(pte)); pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_TILE_L3); pte = set_remote_cache_cpu(pte, cpu); - set_pte(ptep, pte); + set_pte_at(&init_mm, addr, ptep, pte); /* Update the lowmem mapping for consistency. */ lowmem_va = (unsigned long)pfn_to_kaddr(pfn); - ptep = virt_to_pte(NULL, lowmem_va); + ptep = virt_to_kpte(lowmem_va); if (pte_huge(*ptep)) { printk(KERN_DEBUG "early shatter of huge page" " at %#lx\n", lowmem_va); shatter_pmd((pmd_t *)ptep); - ptep = virt_to_pte(NULL, lowmem_va); + ptep = virt_to_kpte(lowmem_va); BUG_ON(pte_huge(*ptep)); } BUG_ON(pfn != pte_pfn(*ptep)); - set_pte(ptep, pte); + set_pte_at(&init_mm, lowmem_va, ptep, pte); } } @@ -1438,16 +1631,17 @@ static struct resource code_resource = { }; /* - * We reserve all resources above 4GB so that PCI won't try to put - * mappings above 4GB; the standard allows that for some devices but - * the probing code trunates values to 32 bits. + * On Pro, we reserve all resources above 4GB so that PCI won't try to put + * mappings above 4GB. */ -#ifdef CONFIG_PCI +#if defined(CONFIG_PCI) && !defined(__tilegx__) static struct resource* __init insert_non_bus_resource(void) { struct resource *res = kzalloc(sizeof(struct resource), GFP_ATOMIC); + if (!res) + return NULL; res->name = "Non-Bus Physical Address Space"; res->start = (1ULL << 32); res->end = -1LL; @@ -1461,11 +1655,13 @@ insert_non_bus_resource(void) #endif static struct resource* __init -insert_ram_resource(u64 start_pfn, u64 end_pfn) +insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved) { struct resource *res = kzalloc(sizeof(struct resource), GFP_ATOMIC); - res->name = "System RAM"; + if (!res) + return NULL; + res->name = reserved ? "Reserved" : "System RAM"; res->start = start_pfn << PAGE_SHIFT; res->end = (end_pfn << PAGE_SHIFT) - 1; res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; @@ -1485,10 +1681,9 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn) static int __init request_standard_resources(void) { int i; - enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET }; + enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET }; - iomem_resource.end = -1LL; -#ifdef CONFIG_PCI +#if defined(CONFIG_PCI) && !defined(__tilegx__) insert_non_bus_resource(); #endif @@ -1496,16 +1691,16 @@ static int __init request_standard_resources(void) u64 start_pfn = node_start_pfn[i]; u64 end_pfn = node_end_pfn[i]; -#ifdef CONFIG_PCI +#if defined(CONFIG_PCI) && !defined(__tilegx__) if (start_pfn <= pci_reserve_start_pfn && end_pfn > pci_reserve_start_pfn) { if (end_pfn > pci_reserve_end_pfn) insert_ram_resource(pci_reserve_end_pfn, - end_pfn); + end_pfn, 0); end_pfn = pci_reserve_start_pfn; } #endif - insert_ram_resource(start_pfn, end_pfn); + insert_ram_resource(start_pfn, end_pfn, 0); } code_resource.start = __pa(_text - CODE_DELTA); @@ -1516,6 +1711,13 @@ static int __init request_standard_resources(void) insert_resource(&iomem_resource, &code_resource); insert_resource(&iomem_resource, &data_resource); + /* Mark any "memmap" regions busy for the resource manager. */ + for (i = 0; i < memmap_nr; ++i) { + struct memmap_entry *m = &memmap_map[i]; + insert_ram_resource(PFN_DOWN(m->addr), + PFN_UP(m->addr + m->size - 1), 1); + } + #ifdef CONFIG_KEXEC insert_resource(&iomem_resource, &crashk_res); #endif diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index bedaf4e9f3a..d1d026f0126 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -33,19 +33,11 @@ #include <asm/ucontext.h> #include <asm/sigframe.h> #include <asm/syscalls.h> +#include <asm/vdso.h> #include <arch/interrupts.h> #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -SYSCALL_DEFINE3(sigaltstack, const stack_t __user *, uss, - stack_t __user *, uoss, struct pt_regs *, regs) -{ - return do_sigaltstack(uss, uoss, regs->sp); -} - - /* * Do a signal return; undo the signal stack. */ @@ -85,8 +77,9 @@ void signal_fault(const char *type, struct pt_regs *regs, } /* The assembly shim for this function arranges to ignore the return value. */ -SYSCALL_DEFINE1(rt_sigreturn, struct pt_regs *, regs) +SYSCALL_DEFINE0(rt_sigreturn) { + struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->sp); sigset_t set; @@ -96,16 +89,12 @@ SYSCALL_DEFINE1(rt_sigreturn, struct pt_regs *, regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; return 0; @@ -196,17 +185,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __clear_user(&frame->save_area, sizeof(frame->save_area)); err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(NULL, &frame->uc.uc_link); - err |= __put_user((void __user *)(current->sas_ss_sp), - &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= __save_altstack(&frame->uc.uc_stack, regs->sp); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) goto give_sigsegv; - restorer = VDSO_BASE; + restorer = VDSO_SYM(&__vdso_rt_sigreturn); if (ka->sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ka->sa.sa_restorer; @@ -225,15 +210,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->regs[1] = (unsigned long) &frame->info; regs->regs[2] = (unsigned long) &frame->uc; regs->flags |= PT_FLAGS_CALLER_SAVES; - - /* - * Notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); - return 0; give_sigsegv: @@ -245,10 +221,11 @@ give_sigsegv: * OK, we're invoking a handler */ -static int handle_signal(unsigned long sig, siginfo_t *info, - struct k_sigaction *ka, sigset_t *oldset, +static void handle_signal(unsigned long sig, siginfo_t *info, + struct k_sigaction *ka, struct pt_regs *regs) { + sigset_t *oldset = sigmask_to_save(); int ret; /* Are we from a system call? */ @@ -281,21 +258,10 @@ static int handle_signal(unsigned long sig, siginfo_t *info, else #endif ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret == 0) { - /* This code is only called from system calls or from - * the work_pending path in the return-to-user code, and - * either way we can re-enable interrupts unconditionally. - */ - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked, - ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(¤t->blocked, sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - } - - return ret; + if (ret) + return; + signal_delivered(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } /* @@ -308,7 +274,6 @@ void do_signal(struct pt_regs *regs) siginfo_t info; int signr; struct k_sigaction ka; - sigset_t *oldset; /* * i386 will check if we're coming from kernel mode and bail out @@ -317,24 +282,10 @@ void do_signal(struct pt_regs *regs) * helpful, we can reinstate the check on "!user_mode(regs)". */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) - oldset = ¤t->saved_sigmask; - else - oldset = ¤t->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. - */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - } - + handle_signal(signr, &info, &ka, regs); goto done; } @@ -359,10 +310,7 @@ void do_signal(struct pt_regs *regs) } /* If there's no signal to deliver, just put the saved sigmask back. */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) { - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); - } + restore_saved_sigmask(); done: /* Avoid double syscall restart if there are nested signals. */ @@ -373,14 +321,13 @@ int show_unhandled_signals = 1; static int __init crashinfo(char *str) { - unsigned long val; const char *word; if (*str == '\0') - val = 2; - else if (*str != '=' || strict_strtoul(++str, 0, &val) != 0) + show_unhandled_signals = 2; + else if (*str != '=' || kstrtoint(++str, 0, &show_unhandled_signals) != 0) return 0; - show_unhandled_signals = val; + switch (show_unhandled_signals) { case 0: word = "No"; diff --git a/arch/tile/kernel/single_step.c b/arch/tile/kernel/single_step.c index b7a87950408..de07fa7d131 100644 --- a/arch/tile/kernel/single_step.c +++ b/arch/tile/kernel/single_step.c @@ -12,40 +12,30 @@ * more details. * * A code-rewriter that enables instruction single-stepping. - * Derived from iLib's single-stepping code. */ -#ifndef __tilegx__ /* Hardware support for single step unavailable. */ - -/* These functions are only used on the TILE platform */ +#include <linux/smp.h> +#include <linux/ptrace.h> #include <linux/slab.h> #include <linux/thread_info.h> #include <linux/uaccess.h> #include <linux/mman.h> #include <linux/types.h> #include <linux/err.h> +#include <linux/prctl.h> #include <asm/cacheflush.h> +#include <asm/traps.h> +#include <asm/uaccess.h> +#include <asm/unaligned.h> #include <arch/abi.h> +#include <arch/spr_def.h> #include <arch/opcode.h> -#define signExtend17(val) sign_extend((val), 17) -#define TILE_X1_MASK (0xffffffffULL << 31) - -int unaligned_printk; -static int __init setup_unaligned_printk(char *str) -{ - long val; - if (strict_strtol(str, 0, &val) != 0) - return 0; - unaligned_printk = val; - pr_info("Printk for each unaligned data accesses is %s\n", - unaligned_printk ? "enabled" : "disabled"); - return 1; -} -__setup("unaligned_printk=", setup_unaligned_printk); +#ifndef __tilegx__ /* Hardware support for single step unavailable. */ -unsigned int unaligned_fixup_count; +#define signExtend17(val) sign_extend((val), 17) +#define TILE_X1_MASK (0xffffffffULL << 31) enum mem_op { MEMOP_NONE, @@ -55,12 +45,13 @@ enum mem_op { MEMOP_STORE_POSTINCR }; -static inline tile_bundle_bits set_BrOff_X1(tile_bundle_bits n, s32 offset) +static inline tilepro_bundle_bits set_BrOff_X1(tilepro_bundle_bits n, + s32 offset) { - tile_bundle_bits result; + tilepro_bundle_bits result; /* mask out the old offset */ - tile_bundle_bits mask = create_BrOff_X1(-1); + tilepro_bundle_bits mask = create_BrOff_X1(-1); result = n & (~mask); /* or in the new offset */ @@ -69,10 +60,11 @@ static inline tile_bundle_bits set_BrOff_X1(tile_bundle_bits n, s32 offset) return result; } -static inline tile_bundle_bits move_X1(tile_bundle_bits n, int dest, int src) +static inline tilepro_bundle_bits move_X1(tilepro_bundle_bits n, int dest, + int src) { - tile_bundle_bits result; - tile_bundle_bits op; + tilepro_bundle_bits result; + tilepro_bundle_bits op; result = n & (~TILE_X1_MASK); @@ -86,13 +78,13 @@ static inline tile_bundle_bits move_X1(tile_bundle_bits n, int dest, int src) return result; } -static inline tile_bundle_bits nop_X1(tile_bundle_bits n) +static inline tilepro_bundle_bits nop_X1(tilepro_bundle_bits n) { return move_X1(n, TREG_ZERO, TREG_ZERO); } -static inline tile_bundle_bits addi_X1( - tile_bundle_bits n, int dest, int src, int imm) +static inline tilepro_bundle_bits addi_X1( + tilepro_bundle_bits n, int dest, int src, int imm) { n &= ~TILE_X1_MASK; @@ -106,15 +98,26 @@ static inline tile_bundle_bits addi_X1( return n; } -static tile_bundle_bits rewrite_load_store_unaligned( +static tilepro_bundle_bits rewrite_load_store_unaligned( struct single_step_state *state, - tile_bundle_bits bundle, + tilepro_bundle_bits bundle, struct pt_regs *regs, enum mem_op mem_op, int size, int sign_ext) { unsigned char __user *addr; int val_reg, addr_reg, err, val; + int align_ctl; + + align_ctl = unaligned_fixup; + switch (task_thread_info(current)->align_ctl) { + case PR_UNALIGN_NOPRINT: + align_ctl = 1; + break; + case PR_UNALIGN_SIGBUS: + align_ctl = 0; + break; + } /* Get address and value registers */ if (bundle & TILEPRO_BUNDLE_Y_ENCODING_MASK) { @@ -152,9 +155,25 @@ static tile_bundle_bits rewrite_load_store_unaligned( if (((unsigned long)addr % size) == 0) return bundle; -#ifndef __LITTLE_ENDIAN -# error We assume little-endian representation with copy_xx_user size 2 here -#endif + /* + * Return SIGBUS with the unaligned address, if requested. + * Note that we return SIGBUS even for completely invalid addresses + * as long as they are in fact unaligned; this matches what the + * tilepro hardware would be doing, if it could provide us with the + * actual bad address in an SPR, which it doesn't. + */ + if (align_ctl == 0) { + siginfo_t info = { + .si_signo = SIGBUS, + .si_code = BUS_ADRALN, + .si_addr = addr + }; + trace_unhandled_signal("unaligned trap", regs, + (unsigned long)addr, SIGBUS); + force_sig_info(info.si_signo, &info, current); + return (tilepro_bundle_bits) 0; + } + /* Handle unaligned load/store */ if (mem_op == MEMOP_LOAD || mem_op == MEMOP_LOAD_POSTINCR) { unsigned short val_16; @@ -175,32 +194,31 @@ static tile_bundle_bits rewrite_load_store_unaligned( state->update = 1; } } else { + unsigned short val_16; val = (val_reg == TREG_ZERO) ? 0 : regs->regs[val_reg]; - err = copy_to_user(addr, &val, size); + switch (size) { + case 2: + val_16 = val; + err = copy_to_user(addr, &val_16, sizeof(val_16)); + break; + case 4: + err = copy_to_user(addr, &val, sizeof(val)); + break; + default: + BUG(); + } } if (err) { siginfo_t info = { - .si_signo = SIGSEGV, - .si_code = SEGV_MAPERR, - .si_addr = addr - }; - trace_unhandled_signal("segfault", regs, - (unsigned long)addr, SIGSEGV); - force_sig_info(info.si_signo, &info, current); - return (tile_bundle_bits) 0; - } - - if (unaligned_fixup == 0) { - siginfo_t info = { .si_signo = SIGBUS, .si_code = BUS_ADRALN, .si_addr = addr }; - trace_unhandled_signal("unaligned trap", regs, + trace_unhandled_signal("bad address for unaligned fixup", regs, (unsigned long)addr, SIGBUS); force_sig_info(info.si_signo, &info, current); - return (tile_bundle_bits) 0; + return (tilepro_bundle_bits) 0; } if (unaligned_printk || unaligned_fixup_count == 0) { @@ -269,7 +287,7 @@ void single_step_execve(void) ti->step_state = NULL; } -/** +/* * single_step_once() - entry point when single stepping has been triggered. * @regs: The machine register state * @@ -288,20 +306,31 @@ void single_step_execve(void) */ void single_step_once(struct pt_regs *regs) { - extern tile_bundle_bits __single_step_ill_insn; - extern tile_bundle_bits __single_step_j_insn; - extern tile_bundle_bits __single_step_addli_insn; - extern tile_bundle_bits __single_step_auli_insn; + extern tilepro_bundle_bits __single_step_ill_insn; + extern tilepro_bundle_bits __single_step_j_insn; + extern tilepro_bundle_bits __single_step_addli_insn; + extern tilepro_bundle_bits __single_step_auli_insn; struct thread_info *info = (void *)current_thread_info(); struct single_step_state *state = info->step_state; int is_single_step = test_ti_thread_flag(info, TIF_SINGLESTEP); - tile_bundle_bits __user *buffer, *pc; - tile_bundle_bits bundle; + tilepro_bundle_bits __user *buffer, *pc; + tilepro_bundle_bits bundle; int temp_reg; int target_reg = TREG_LR; int err; enum mem_op mem_op = MEMOP_NONE; int size = 0, sign_ext = 0; /* happy compiler */ + int align_ctl; + + align_ctl = unaligned_fixup; + switch (task_thread_info(current)->align_ctl) { + case PR_UNALIGN_NOPRINT: + align_ctl = 1; + break; + case PR_UNALIGN_SIGBUS: + align_ctl = 0; + break; + } asm( " .pushsection .rodata.single_step\n" @@ -338,12 +367,10 @@ void single_step_once(struct pt_regs *regs) } /* allocate a cache line of writable, executable memory */ - down_write(¤t->mm->mmap_sem); - buffer = (void __user *) do_mmap(NULL, 0, 64, + buffer = (void __user *) vm_mmap(NULL, 0, 64, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0); - up_write(¤t->mm->mmap_sem); if (IS_ERR((void __force *)buffer)) { kfree(state); @@ -376,7 +403,7 @@ void single_step_once(struct pt_regs *regs) if (regs->faultnum == INT_SWINT_1) regs->pc -= 8; - pc = (tile_bundle_bits __user *)(regs->pc); + pc = (tilepro_bundle_bits __user *)(regs->pc); if (get_user(bundle, pc) != 0) { pr_err("Couldn't read instruction at %p trying to step\n", pc); return; @@ -519,7 +546,6 @@ void single_step_once(struct pt_regs *regs) } break; -#if CHIP_HAS_WH64() /* postincrement operations */ case IMM_0_OPCODE_X1: switch (get_ImmOpcodeExtension_X1(bundle)) { @@ -554,7 +580,6 @@ void single_step_once(struct pt_regs *regs) break; } break; -#endif /* CHIP_HAS_WH64() */ } if (state->update) { @@ -613,9 +638,9 @@ void single_step_once(struct pt_regs *regs) /* * Check if we need to rewrite an unaligned load/store. - * Returning zero is a special value meaning we need to SIGSEGV. + * Returning zero is a special value meaning we generated a signal. */ - if (mem_op != MEMOP_NONE && unaligned_fixup >= 0) { + if (mem_op != MEMOP_NONE && align_ctl >= 0) { bundle = rewrite_load_store_unaligned(state, bundle, regs, mem_op, size, sign_ext); if (bundle == 0) @@ -654,9 +679,9 @@ void single_step_once(struct pt_regs *regs) } /* End with a jump back to the next instruction */ - delta = ((regs->pc + TILE_BUNDLE_SIZE_IN_BYTES) - + delta = ((regs->pc + TILEPRO_BUNDLE_SIZE_IN_BYTES) - (unsigned long)buffer) >> - TILE_LOG2_BUNDLE_ALIGNMENT_IN_BYTES; + TILEPRO_LOG2_BUNDLE_ALIGNMENT_IN_BYTES; bundle = __single_step_j_insn; bundle |= create_JOffLong_X1(delta); err |= __put_user(bundle, buffer++); @@ -684,9 +709,6 @@ void single_step_once(struct pt_regs *regs) } #else -#include <linux/smp.h> -#include <linux/ptrace.h> -#include <arch/spr_def.h> static DEFINE_PER_CPU(unsigned long, ss_saved_pc); @@ -729,10 +751,10 @@ void gx_singlestep_handle(struct pt_regs *regs, int fault_num) } else if ((*ss_pc != regs->pc) || (!(control & SPR_SINGLE_STEP_CONTROL_1__CANCELED_MASK))) { - ptrace_notify(SIGTRAP); control |= SPR_SINGLE_STEP_CONTROL_1__CANCELED_MASK; control |= SPR_SINGLE_STEP_CONTROL_1__INHIBIT_MASK; __insn_mtspr(SPR_SINGLE_STEP_CONTROL_K, control); + send_sigtrap(current, regs); } } diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index c52224d5ed4..01e8ab29f43 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c @@ -20,8 +20,13 @@ #include <linux/irq.h> #include <linux/module.h> #include <asm/cacheflush.h> +#include <asm/homecache.h> -HV_Topology smp_topology __write_once; +/* + * We write to width and height with a single store in head_NN.S, + * so make the variable aligned to "long". + */ +HV_Topology smp_topology __write_once __aligned(sizeof(long)); EXPORT_SYMBOL(smp_topology); #if CHIP_HAS_IPI() @@ -87,25 +92,6 @@ void send_IPI_allbutself(int tag) send_IPI_many(&mask, tag); } - -/* - * Provide smp_call_function_mask, but also run function locally - * if specified in the mask. - */ -void on_each_cpu_mask(const struct cpumask *mask, void (*func)(void *), - void *info, bool wait) -{ - int cpu = get_cpu(); - smp_call_function_many(mask, func, info, wait); - if (cpumask_test_cpu(cpu, mask)) { - local_irq_disable(); - func(info); - local_irq_enable(); - } - put_cpu(); -} - - /* * Functions related to starting/stopping cpus. */ @@ -119,10 +105,10 @@ static void smp_start_cpu_interrupt(void) /* Handler to stop the current cpu. */ static void smp_stop_cpu_interrupt(void) { - set_cpu_online(smp_processor_id(), 0); arch_local_irq_disable_all(); + set_cpu_online(smp_processor_id(), 0); for (;;) - asm("nap"); + asm("nap; nop"); } /* This function calls the 'stop' function on all other CPUs in the system. */ @@ -132,6 +118,12 @@ void smp_send_stop(void) send_IPI_allbutself(MSG_TAG_STOP_CPU); } +/* On panic, just wait; we may get an smp_send_stop() later on. */ +void panic_smp_self_stop(void) +{ + while (1) + asm("nap; nop"); +} /* * Dispatch code called from hv_message_intr() for HV_MSG_TILE hv messages. @@ -180,9 +172,16 @@ static void ipi_flush_icache_range(void *info) void flush_icache_range(unsigned long start, unsigned long end) { struct ipi_flush flush = { start, end }; - preempt_disable(); - on_each_cpu(ipi_flush_icache_range, &flush, 1); - preempt_enable(); + + /* If invoked with irqs disabled, we can not issue IPIs. */ + if (irqs_disabled()) + flush_remote(0, HV_FLUSH_EVICT_L1I, NULL, 0, 0, 0, + NULL, NULL, 0); + else { + preempt_disable(); + on_each_cpu(ipi_flush_icache_range, &flush, 1); + preempt_enable(); + } } @@ -216,7 +215,7 @@ void __init ipi_init(void) if (hv_get_ipi_pte(tile, KERNEL_PL, &pte) != 0) panic("Failed to initialize IPI for cpu %d\n", cpu); - offset = hv_pte_get_pfn(pte) << PAGE_SHIFT; + offset = PFN_PHYS(pte_pfn(pte)); ipi_mappings[cpu] = ioremap_prot(offset, PAGE_SIZE, pte); } #endif diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c index b949edcec20..732e9d13866 100644 --- a/arch/tile/kernel/smpboot.c +++ b/arch/tile/kernel/smpboot.c @@ -133,22 +133,24 @@ static __init int reset_init_affinity(void) } late_initcall(reset_init_affinity); -static struct cpumask cpu_started __cpuinitdata; +static struct cpumask cpu_started; /* * Activate a secondary processor. Very minimal; don't add anything * to this path without knowing what you're doing, since SMP booting * is pretty fragile. */ -static void __cpuinit start_secondary(void) +static void start_secondary(void) { - int cpuid = smp_processor_id(); + int cpuid; + + preempt_disable(); + + cpuid = smp_processor_id(); /* Set our thread pointer appropriately. */ set_my_cpu_offset(__per_cpu_offset[cpuid]); - preempt_disable(); - /* * In large machines even this will slow us down, since we * will be contending for for the printk spinlock. @@ -183,7 +185,7 @@ static void __cpuinit start_secondary(void) /* * Bring a secondary processor online. */ -void __cpuinit online_secondary(void) +void online_secondary(void) { /* * low-memory mappings have been cleared, flush them from @@ -196,17 +198,9 @@ void __cpuinit online_secondary(void) /* This must be done before setting cpu_online_mask */ wmb(); - /* - * We need to hold call_lock, so there is no inconsistency - * between the time smp_call_function() determines number of - * IPI recipients, and the time when the determination is made - * for which cpus receive the IPI. Holding this - * lock helps us to not include this cpu in a currently in progress - * smp_call_function(). - */ - ipi_call_lock(); + notify_cpu_starting(smp_processor_id()); + set_cpu_online(smp_processor_id(), 1); - ipi_call_unlock(); __get_cpu_var(cpu_state) = CPU_ONLINE; /* Set up tile-specific state for this cpu. */ @@ -215,12 +209,10 @@ void __cpuinit online_secondary(void) /* Set up tile-timer clock-event device on this cpu */ setup_tile_timer(); - preempt_enable(); - - cpu_idle(); + cpu_startup_entry(CPUHP_ONLINE); } -int __cpuinit __cpu_up(unsigned int cpu) +int __cpu_up(unsigned int cpu, struct task_struct *tidle) { /* Wait 5s total for all CPUs for them to come online */ static int timeout; diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c index 37ee4d037e0..c93977a6211 100644 --- a/arch/tile/kernel/stack.c +++ b/arch/tile/kernel/stack.c @@ -21,12 +21,16 @@ #include <linux/stacktrace.h> #include <linux/uaccess.h> #include <linux/mmzone.h> +#include <linux/dcache.h> +#include <linux/fs.h> +#include <linux/string.h> #include <asm/backtrace.h> #include <asm/page.h> -#include <asm/tlbflush.h> #include <asm/ucontext.h> +#include <asm/switch_to.h> #include <asm/sigframe.h> #include <asm/stack.h> +#include <asm/vdso.h> #include <arch/abi.h> #include <arch/interrupts.h> @@ -44,72 +48,23 @@ static int in_kernel_stack(struct KBacktraceIterator *kbt, unsigned long sp) return sp >= kstack_base && sp < kstack_base + THREAD_SIZE; } -/* Is address valid for reading? */ -static int valid_address(struct KBacktraceIterator *kbt, unsigned long address) -{ - HV_PTE *l1_pgtable = kbt->pgtable; - HV_PTE *l2_pgtable; - unsigned long pfn; - HV_PTE pte; - struct page *page; - - if (l1_pgtable == NULL) - return 0; /* can't read user space in other tasks */ - -#ifdef CONFIG_64BIT - /* Find the real l1_pgtable by looking in the l0_pgtable. */ - pte = l1_pgtable[HV_L0_INDEX(address)]; - if (!hv_pte_get_present(pte)) - return 0; - pfn = hv_pte_get_pfn(pte); - if (pte_huge(pte)) { - if (!pfn_valid(pfn)) { - pr_err("L0 huge page has bad pfn %#lx\n", pfn); - return 0; - } - return hv_pte_get_present(pte) && hv_pte_get_readable(pte); - } - page = pfn_to_page(pfn); - BUG_ON(PageHighMem(page)); /* No HIGHMEM on 64-bit. */ - l1_pgtable = (HV_PTE *)pfn_to_kaddr(pfn); -#endif - pte = l1_pgtable[HV_L1_INDEX(address)]; - if (!hv_pte_get_present(pte)) - return 0; - pfn = hv_pte_get_pfn(pte); - if (pte_huge(pte)) { - if (!pfn_valid(pfn)) { - pr_err("huge page has bad pfn %#lx\n", pfn); - return 0; - } - return hv_pte_get_present(pte) && hv_pte_get_readable(pte); - } - - page = pfn_to_page(pfn); - if (PageHighMem(page)) { - pr_err("L2 page table not in LOWMEM (%#llx)\n", - HV_PFN_TO_CPA(pfn)); - return 0; - } - l2_pgtable = (HV_PTE *)pfn_to_kaddr(pfn); - pte = l2_pgtable[HV_L2_INDEX(address)]; - return hv_pte_get_present(pte) && hv_pte_get_readable(pte); -} - /* Callback for backtracer; basically a glorified memcpy */ static bool read_memory_func(void *result, unsigned long address, unsigned int size, void *vkbt) { int retval; struct KBacktraceIterator *kbt = (struct KBacktraceIterator *)vkbt; + + if (address == 0) + return 0; if (__kernel_text_address(address)) { /* OK to read kernel code. */ } else if (address >= PAGE_OFFSET) { /* We only tolerate kernel-space reads of this task's stack */ if (!in_kernel_stack(kbt, address)) return 0; - } else if (!valid_address(kbt, address)) { - return 0; /* invalid user-space address */ + } else if (!kbt->is_current) { + return 0; /* can't read from other user address spaces */ } pagefault_disable(); retval = __copy_from_user_inatomic(result, @@ -127,6 +82,8 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt) unsigned long sp = kbt->it.sp; struct pt_regs *p; + if (sp % sizeof(long) != 0) + return NULL; if (!in_kernel_stack(kbt, sp)) return NULL; if (!in_kernel_stack(kbt, sp + C_ABI_SAVE_AREA_SIZE + PTREGS_SIZE-1)) @@ -147,9 +104,8 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt) p->sp >= sp) { if (kbt->verbose) pr_err(" <%s while in kernel mode>\n", fault); - } else if (EX1_PL(p->ex1) == USER_PL && - p->pc < PAGE_OFFSET && - p->sp < PAGE_OFFSET) { + } else if (user_mode(p) && + p->sp < PAGE_OFFSET && p->sp != 0) { if (kbt->verbose) pr_err(" <%s while in user mode>\n", fault); } else if (kbt->verbose) { @@ -157,7 +113,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt) p->pc, p->sp, p->ex1); p = NULL; } - if (!kbt->profile || (INT_MASK(p->faultnum) & QUEUED_INTERRUPTS) == 0) + if (!kbt->profile || ((1ULL << p->faultnum) & QUEUED_INTERRUPTS) == 0) return p; return NULL; } @@ -165,31 +121,31 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt) /* Is the pc pointing to a sigreturn trampoline? */ static int is_sigreturn(unsigned long pc) { - return (pc == VDSO_BASE); + return current->mm && (pc == VDSO_SYM(&__vdso_rt_sigreturn)); } /* Return a pt_regs pointer for a valid signal handler frame */ -static struct pt_regs *valid_sigframe(struct KBacktraceIterator* kbt) +static struct pt_regs *valid_sigframe(struct KBacktraceIterator* kbt, + struct rt_sigframe* kframe) { BacktraceIterator *b = &kbt->it; - if (b->pc == VDSO_BASE) { - struct rt_sigframe *frame; - unsigned long sigframe_top = - b->sp + sizeof(struct rt_sigframe) - 1; - if (!valid_address(kbt, b->sp) || - !valid_address(kbt, sigframe_top)) { - if (kbt->verbose) - pr_err(" (odd signal: sp %#lx?)\n", - (unsigned long)(b->sp)); + if (is_sigreturn(b->pc) && b->sp < PAGE_OFFSET && + b->sp % sizeof(long) == 0) { + int retval; + pagefault_disable(); + retval = __copy_from_user_inatomic( + kframe, (void __user __force *)b->sp, + sizeof(*kframe)); + pagefault_enable(); + if (retval != 0 || + (unsigned int)(kframe->info.si_signo) >= _NSIG) return NULL; - } - frame = (struct rt_sigframe *)b->sp; if (kbt->verbose) { pr_err(" <received signal %d>\n", - frame->info.si_signo); + kframe->info.si_signo); } - return (struct pt_regs *)&frame->uc.uc_mcontext; + return (struct pt_regs *)&kframe->uc.uc_mcontext; } return NULL; } @@ -202,10 +158,11 @@ static int KBacktraceIterator_is_sigreturn(struct KBacktraceIterator *kbt) static int KBacktraceIterator_restart(struct KBacktraceIterator *kbt) { struct pt_regs *p; + struct rt_sigframe kframe; p = valid_fault_handler(kbt); if (p == NULL) - p = valid_sigframe(kbt); + p = valid_sigframe(kbt, &kframe); if (p == NULL) return 0; backtrace_init(&kbt->it, read_memory_func, kbt, @@ -239,21 +196,21 @@ static int KBacktraceIterator_next_item_inclusive( */ static void validate_stack(struct pt_regs *regs) { - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); unsigned long ksp0 = get_current_ksp0(); - unsigned long ksp0_base = ksp0 - THREAD_SIZE; + unsigned long ksp0_base = ksp0 & -THREAD_SIZE; unsigned long sp = stack_pointer; if (EX1_PL(regs->ex1) == KERNEL_PL && regs->sp >= ksp0) { - pr_err("WARNING: cpu %d: kernel stack page %#lx underrun!\n" + pr_err("WARNING: cpu %d: kernel stack %#lx..%#lx underrun!\n" " sp %#lx (%#lx in caller), caller pc %#lx, lr %#lx\n", - cpu, ksp0_base, sp, regs->sp, regs->pc, regs->lr); + cpu, ksp0_base, ksp0, sp, regs->sp, regs->pc, regs->lr); } else if (sp < ksp0_base + sizeof(struct thread_info)) { - pr_err("WARNING: cpu %d: kernel stack page %#lx overrun!\n" + pr_err("WARNING: cpu %d: kernel stack %#lx..%#lx overrun!\n" " sp %#lx (%#lx in caller), caller pc %#lx, lr %#lx\n", - cpu, ksp0_base, sp, regs->sp, regs->pc, regs->lr); + cpu, ksp0_base, ksp0, sp, regs->sp, regs->pc, regs->lr); } } @@ -265,41 +222,19 @@ void KBacktraceIterator_init(struct KBacktraceIterator *kbt, /* * Set up callback information. We grab the kernel stack base - * so we will allow reads of that address range, and if we're - * asking about the current process we grab the page table - * so we can check user accesses before trying to read them. - * We flush the TLB to avoid any weird skew issues. + * so we will allow reads of that address range. */ - is_current = (t == NULL); + is_current = (t == NULL || t == current); kbt->is_current = is_current; if (is_current) t = validate_current(); kbt->task = t; - kbt->pgtable = NULL; kbt->verbose = 0; /* override in caller if desired */ kbt->profile = 0; /* override in caller if desired */ kbt->end = KBT_ONGOING; - kbt->new_context = 0; - if (is_current) { - HV_PhysAddr pgdir_pa = hv_inquire_context().page_table; - if (pgdir_pa == (unsigned long)swapper_pg_dir - PAGE_OFFSET) { - /* - * Not just an optimization: this also allows - * this to work at all before va/pa mappings - * are set up. - */ - kbt->pgtable = swapper_pg_dir; - } else { - struct page *page = pfn_to_page(PFN_DOWN(pgdir_pa)); - if (!PageHighMem(page)) - kbt->pgtable = __va(pgdir_pa); - else - pr_err("page table not in LOWMEM" - " (%#llx)\n", pgdir_pa); - } - local_flush_tlb_all(); + kbt->new_context = 1; + if (is_current) validate_stack(regs); - } if (regs == NULL) { if (is_current || t->state == TASK_RUNNING) { @@ -345,6 +280,95 @@ void KBacktraceIterator_next(struct KBacktraceIterator *kbt) } EXPORT_SYMBOL(KBacktraceIterator_next); +static void describe_addr(struct KBacktraceIterator *kbt, + unsigned long address, + int have_mmap_sem, char *buf, size_t bufsize) +{ + struct vm_area_struct *vma; + size_t namelen, remaining; + unsigned long size, offset, adjust; + char *p, *modname; + const char *name; + int rc; + + /* + * Look one byte back for every caller frame (i.e. those that + * aren't a new context) so we look up symbol data for the + * call itself, not the following instruction, which may be on + * a different line (or in a different function). + */ + adjust = !kbt->new_context; + address -= adjust; + + if (address >= PAGE_OFFSET) { + /* Handle kernel symbols. */ + BUG_ON(bufsize < KSYM_NAME_LEN); + name = kallsyms_lookup(address, &size, &offset, + &modname, buf); + if (name == NULL) { + buf[0] = '\0'; + return; + } + namelen = strlen(buf); + remaining = (bufsize - 1) - namelen; + p = buf + namelen; + rc = snprintf(p, remaining, "+%#lx/%#lx ", + offset + adjust, size); + if (modname && rc < remaining) + snprintf(p + rc, remaining - rc, "[%s] ", modname); + buf[bufsize-1] = '\0'; + return; + } + + /* If we don't have the mmap_sem, we can't show any more info. */ + buf[0] = '\0'; + if (!have_mmap_sem) + return; + + /* Find vma info. */ + vma = find_vma(kbt->task->mm, address); + if (vma == NULL || address < vma->vm_start) { + snprintf(buf, bufsize, "[unmapped address] "); + return; + } + + if (vma->vm_file) { + p = d_path(&vma->vm_file->f_path, buf, bufsize); + if (IS_ERR(p)) + p = "?"; + name = kbasename(p); + } else { + name = "anon"; + } + + /* Generate a string description of the vma info. */ + namelen = strlen(name); + remaining = (bufsize - 1) - namelen; + memmove(buf, name, namelen); + snprintf(buf + namelen, remaining, "[%lx+%lx] ", + vma->vm_start, vma->vm_end - vma->vm_start); +} + +/* + * Avoid possible crash recursion during backtrace. If it happens, it + * makes it easy to lose the actual root cause of the failure, so we + * put a simple guard on all the backtrace loops. + */ +static bool start_backtrace(void) +{ + if (current->thread.in_backtrace) { + pr_err("Backtrace requested while in backtrace!\n"); + return false; + } + current->thread.in_backtrace = true; + return true; +} + +static void end_backtrace(void) +{ + current->thread.in_backtrace = false; +} + /* * This method wraps the backtracer's more generic support. * It is only invoked from the architecture-specific code; show_stack() @@ -353,7 +377,10 @@ EXPORT_SYMBOL(KBacktraceIterator_next); void tile_show_stack(struct KBacktraceIterator *kbt, int headers) { int i; + int have_mmap_sem = 0; + if (!start_backtrace()) + return; if (headers) { /* * Add a blank line since if we are called from panic(), @@ -364,36 +391,21 @@ void tile_show_stack(struct KBacktraceIterator *kbt, int headers) pr_err("Starting stack dump of tid %d, pid %d (%s)" " on cpu %d at cycle %lld\n", kbt->task->pid, kbt->task->tgid, kbt->task->comm, - smp_processor_id(), get_cycles()); + raw_smp_processor_id(), get_cycles()); } kbt->verbose = 1; i = 0; for (; !KBacktraceIterator_end(kbt); KBacktraceIterator_next(kbt)) { - char *modname; - const char *name; - unsigned long address = kbt->it.pc; - unsigned long offset, size; char namebuf[KSYM_NAME_LEN+100]; + unsigned long address = kbt->it.pc; - if (address >= PAGE_OFFSET) - name = kallsyms_lookup(address, &size, &offset, - &modname, namebuf); - else - name = NULL; - - if (!name) - namebuf[0] = '\0'; - else { - size_t namelen = strlen(namebuf); - size_t remaining = (sizeof(namebuf) - 1) - namelen; - char *p = namebuf + namelen; - int rc = snprintf(p, remaining, "+%#lx/%#lx ", - offset, size); - if (modname && rc < remaining) - snprintf(p + rc, remaining - rc, - "[%s] ", modname); - namebuf[sizeof(namebuf)-1] = '\0'; - } + /* Try to acquire the mmap_sem as we pass into userspace. */ + if (address < PAGE_OFFSET && !have_mmap_sem && kbt->task->mm) + have_mmap_sem = + down_read_trylock(&kbt->task->mm->mmap_sem); + + describe_addr(kbt, address, have_mmap_sem, + namebuf, sizeof(namebuf)); pr_err(" frame %d: 0x%lx %s(sp 0x%lx)\n", i++, address, namebuf, (unsigned long)(kbt->it.sp)); @@ -408,6 +420,9 @@ void tile_show_stack(struct KBacktraceIterator *kbt, int headers) pr_err("Stack dump stopped; next frame identical to this one\n"); if (headers) pr_err("Stack dump complete\n"); + if (have_mmap_sem) + up_read(&kbt->task->mm->mmap_sem); + end_backtrace(); } EXPORT_SYMBOL(tile_show_stack); @@ -448,7 +463,7 @@ void _KBacktraceIterator_init_current(struct KBacktraceIterator *kbt, ulong pc, regs_to_pt_regs(®s, pc, lr, sp, r52)); } -/* This is called only from kernel/sched.c, with esp == NULL */ +/* This is called only from kernel/sched/core.c, with esp == NULL */ void show_stack(struct task_struct *task, unsigned long *esp) { struct KBacktraceIterator kbt; @@ -469,6 +484,8 @@ void save_stack_trace_tsk(struct task_struct *task, struct stack_trace *trace) int skip = trace->skip; int i = 0; + if (!start_backtrace()) + goto done; if (task == NULL || task == current) KBacktraceIterator_init_current(&kbt); else @@ -482,6 +499,8 @@ void save_stack_trace_tsk(struct task_struct *task, struct stack_trace *trace) break; trace->entries[i++] = kbt.it.pc; } + end_backtrace(); +done: trace->nr_entries = i; } EXPORT_SYMBOL(save_stack_trace_tsk); @@ -490,6 +509,7 @@ void save_stack_trace(struct stack_trace *trace) { save_stack_trace_tsk(NULL, trace); } +EXPORT_SYMBOL_GPL(save_stack_trace); #endif diff --git a/arch/tile/kernel/sys.c b/arch/tile/kernel/sys.c index cb44ba7ccd2..38debe70606 100644 --- a/arch/tile/kernel/sys.c +++ b/arch/tile/kernel/sys.c @@ -32,11 +32,19 @@ #include <asm/syscalls.h> #include <asm/pgtable.h> #include <asm/homecache.h> +#include <asm/cachectl.h> #include <arch/chip.h> -SYSCALL_DEFINE0(flush_cache) +SYSCALL_DEFINE3(cacheflush, unsigned long, addr, unsigned long, len, + unsigned long, flags) { - homecache_evict(cpumask_of(smp_processor_id())); + /* DCACHE is not particularly effective if not bound to one cpu. */ + if (flags & DCACHE) + homecache_evict(cpumask_of(raw_smp_processor_id())); + + if (flags & ICACHE) + flush_remote(0, HV_FLUSH_EVICT_L1I, mm_cpumask(current->mm), + 0, 0, 0, NULL, NULL, 0); return 0; } @@ -100,14 +108,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, #define sys_readahead sys32_readahead #endif -/* Call the trampolines to manage pt_regs where necessary. */ -#define sys_execve _sys_execve -#define sys_sigaltstack _sys_sigaltstack +/* Call the assembly trampolines where necessary. */ +#undef sys_rt_sigreturn #define sys_rt_sigreturn _sys_rt_sigreturn #define sys_clone _sys_clone -#ifndef __tilegx__ -#define sys_cmpxchg_badaddr _sys_cmpxchg_badaddr -#endif /* * Note that we can't include <linux/unistd.h> here since the header diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c index f862b005eb7..a3ed12f8f83 100644 --- a/arch/tile/kernel/sysfs.c +++ b/arch/tile/kernel/sysfs.c @@ -93,6 +93,10 @@ HV_CONF_ATTR(mezz_part, HV_CONFSTR_MEZZ_PART_NUM) HV_CONF_ATTR(mezz_serial, HV_CONFSTR_MEZZ_SERIAL_NUM) HV_CONF_ATTR(mezz_revision, HV_CONFSTR_MEZZ_REV) HV_CONF_ATTR(mezz_description, HV_CONFSTR_MEZZ_DESC) +HV_CONF_ATTR(cpumod_part, HV_CONFSTR_CPUMOD_PART_NUM) +HV_CONF_ATTR(cpumod_serial, HV_CONFSTR_CPUMOD_SERIAL_NUM) +HV_CONF_ATTR(cpumod_revision, HV_CONFSTR_CPUMOD_REV) +HV_CONF_ATTR(cpumod_description,HV_CONFSTR_CPUMOD_DESC) HV_CONF_ATTR(switch_control, HV_CONFSTR_SWITCH_CONTROL) static struct attribute *board_attrs[] = { @@ -104,6 +108,10 @@ static struct attribute *board_attrs[] = { &dev_attr_mezz_serial.attr, &dev_attr_mezz_revision.attr, &dev_attr_mezz_description.attr, + &dev_attr_cpumod_part.attr, + &dev_attr_cpumod_serial.attr, + &dev_attr_cpumod_revision.attr, + &dev_attr_cpumod_description.attr, &dev_attr_switch_control.attr, NULL }; @@ -149,6 +157,67 @@ hvconfig_bin_read(struct file *filp, struct kobject *kobj, return count; } +static ssize_t hv_stats_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + int cpu = dev->id; + long lotar = HV_XY_TO_LOTAR(cpu_x(cpu), cpu_y(cpu)); + + ssize_t n = hv_confstr(HV_CONFSTR_HV_STATS, + (unsigned long)page, PAGE_SIZE - 1, + lotar, 0); + n = n < 0 ? 0 : min(n, (ssize_t)PAGE_SIZE - 1); + page[n] = '\0'; + return n; +} + +static ssize_t hv_stats_store(struct device *dev, + struct device_attribute *attr, + const char *page, + size_t count) +{ + int cpu = dev->id; + long lotar = HV_XY_TO_LOTAR(cpu_x(cpu), cpu_y(cpu)); + + ssize_t n = hv_confstr(HV_CONFSTR_HV_STATS, 0, 0, lotar, 1); + return n < 0 ? n : count; +} + +static DEVICE_ATTR(hv_stats, 0644, hv_stats_show, hv_stats_store); + +static int hv_stats_device_add(struct device *dev, struct subsys_interface *sif) +{ + int err, cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + + err = sysfs_create_file(&dev->kobj, &dev_attr_hv_stats.attr); + + return err; +} + +static int hv_stats_device_remove(struct device *dev, + struct subsys_interface *sif) +{ + int cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + + sysfs_remove_file(&dev->kobj, &dev_attr_hv_stats.attr); + return 0; +} + + +static struct subsys_interface hv_stats_interface = { + .name = "hv_stats", + .subsys = &cpu_subsys, + .add_dev = hv_stats_device_add, + .remove_dev = hv_stats_device_remove, +}; + static int __init create_sysfs_entries(void) { int err = 0; @@ -163,7 +232,7 @@ static int __init create_sysfs_entries(void) #define create_hv_attr(name) \ if (!err) \ - err = sysfs_create_file(hypervisor_kobj, &dev_attr_##name); + err = sysfs_create_file(hypervisor_kobj, &dev_attr_##name.attr); create_hv_attr(type); create_hv_attr(version); create_hv_attr(config_version); @@ -180,6 +249,21 @@ static int __init create_sysfs_entries(void) err = sysfs_create_bin_file(hypervisor_kobj, &hvconfig_bin); } + if (!err) { + /* + * Don't bother adding the hv_stats files on each CPU if + * our hypervisor doesn't supply statistics. + */ + int cpu = raw_smp_processor_id(); + long lotar = HV_XY_TO_LOTAR(cpu_x(cpu), cpu_y(cpu)); + char dummy; + ssize_t n = hv_confstr(HV_CONFSTR_HV_STATS, + (unsigned long) &dummy, 1, + lotar, 0); + if (n >= 0) + err = subsys_interface_register(&hv_stats_interface); + } + return err; } subsys_initcall(create_sysfs_entries); diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c index f6f50f2a5e3..462dcd0c170 100644 --- a/arch/tile/kernel/time.c +++ b/arch/tile/kernel/time.c @@ -23,8 +23,10 @@ #include <linux/smp.h> #include <linux/delay.h> #include <linux/module.h> +#include <linux/timekeeper_internal.h> #include <asm/irq_regs.h> #include <asm/traps.h> +#include <asm/vdso.h> #include <hv/hypervisor.h> #include <arch/interrupts.h> #include <arch/spr_def.h> @@ -110,7 +112,6 @@ void __init time_init(void) setup_tile_timer(); } - /* * Define the tile timer clock event device. The timer is driven by * the TILE_TIMER_CONTROL register, which consists of a 31-bit down @@ -159,7 +160,7 @@ static DEFINE_PER_CPU(struct clock_event_device, tile_timer) = { .set_mode = tile_timer_set_mode, }; -void __cpuinit setup_tile_timer(void) +void setup_tile_timer(void) { struct clock_event_device *evt = &__get_cpu_var(tile_timer); @@ -230,6 +231,52 @@ int setup_profiling_timer(unsigned int multiplier) */ cycles_t ns2cycles(unsigned long nsecs) { - struct clock_event_device *dev = &__get_cpu_var(tile_timer); - return ((u64)nsecs * dev->mult) >> dev->shift; + /* + * We do not have to disable preemption here as each core has the same + * clock frequency. + */ + struct clock_event_device *dev = &__raw_get_cpu_var(tile_timer); + + /* + * as in clocksource.h and x86's timer.h, we split the calculation + * into 2 parts to avoid unecessary overflow of the intermediate + * value. This will not lead to any loss of precision. + */ + u64 quot = (u64)nsecs >> dev->shift; + u64 rem = (u64)nsecs & ((1ULL << dev->shift) - 1); + return quot * dev->mult + ((rem * dev->mult) >> dev->shift); +} + +void update_vsyscall_tz(void) +{ + /* Userspace gettimeofday will spin while this value is odd. */ + ++vdso_data->tz_update_count; + smp_wmb(); + vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; + vdso_data->tz_dsttime = sys_tz.tz_dsttime; + smp_wmb(); + ++vdso_data->tz_update_count; +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct timespec wall_time = tk_xtime(tk); + struct timespec *wtm = &tk->wall_to_monotonic; + struct clocksource *clock = tk->clock; + + if (clock != &cycle_counter_cs) + return; + + /* Userspace gettimeofday will spin while this value is odd. */ + ++vdso_data->tb_update_count; + smp_wmb(); + vdso_data->xtime_tod_stamp = clock->cycle_last; + vdso_data->xtime_clock_sec = wall_time.tv_sec; + vdso_data->xtime_clock_nsec = wall_time.tv_nsec; + vdso_data->wtom_clock_sec = wtm->tv_sec; + vdso_data->wtom_clock_nsec = wtm->tv_nsec; + vdso_data->mult = clock->mult; + vdso_data->shift = clock->shift; + smp_wmb(); + ++vdso_data->tb_update_count; } diff --git a/arch/tile/kernel/tlb.c b/arch/tile/kernel/tlb.c index a5f241c24ca..f23b5351567 100644 --- a/arch/tile/kernel/tlb.c +++ b/arch/tile/kernel/tlb.c @@ -15,6 +15,7 @@ #include <linux/cpumask.h> #include <linux/module.h> +#include <linux/hugetlb.h> #include <asm/tlbflush.h> #include <asm/homecache.h> #include <hv/hypervisor.h> @@ -49,25 +50,25 @@ void flush_tlb_current_task(void) flush_tlb_mm(current->mm); } -void flush_tlb_page_mm(const struct vm_area_struct *vma, struct mm_struct *mm, +void flush_tlb_page_mm(struct vm_area_struct *vma, struct mm_struct *mm, unsigned long va) { - unsigned long size = hv_page_size(vma); + unsigned long size = vma_kernel_pagesize(vma); int cache = (vma->vm_flags & VM_EXEC) ? HV_FLUSH_EVICT_L1I : 0; flush_remote(0, cache, mm_cpumask(mm), va, size, size, mm_cpumask(mm), NULL, 0); } -void flush_tlb_page(const struct vm_area_struct *vma, unsigned long va) +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) { flush_tlb_page_mm(vma, vma->vm_mm, va); } EXPORT_SYMBOL(flush_tlb_page); -void flush_tlb_range(const struct vm_area_struct *vma, +void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - unsigned long size = hv_page_size(vma); + unsigned long size = vma_kernel_pagesize(vma); struct mm_struct *mm = vma->vm_mm; int cache = (vma->vm_flags & VM_EXEC) ? HV_FLUSH_EVICT_L1I : 0; flush_remote(0, cache, mm_cpumask(mm), start, end - start, size, @@ -90,8 +91,14 @@ void flush_tlb_all(void) } } +/* + * Callers need to flush the L1I themselves if necessary, e.g. for + * kernel module unload. Otherwise we assume callers are not using + * executable pgprot_t's. Using EVICT_L1I means that dataplane cpus + * will get an unnecessary interrupt otherwise. + */ void flush_tlb_kernel_range(unsigned long start, unsigned long end) { - flush_remote(0, HV_FLUSH_EVICT_L1I, cpu_online_mask, + flush_remote(0, 0, NULL, start, end - start, PAGE_SIZE, cpu_online_mask, NULL, 0); } diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c index 4f47b8a356d..f3ceb6308e4 100644 --- a/arch/tile/kernel/traps.c +++ b/arch/tile/kernel/traps.c @@ -15,12 +15,14 @@ #include <linux/sched.h> #include <linux/kernel.h> #include <linux/kprobes.h> +#include <linux/kdebug.h> #include <linux/module.h> #include <linux/reboot.h> #include <linux/uaccess.h> #include <linux/ptrace.h> #include <asm/stack.h> #include <asm/traps.h> +#include <asm/setup.h> #include <arch/interrupts.h> #include <arch/spr_def.h> @@ -28,7 +30,7 @@ void __init trap_init(void) { - /* Nothing needed here since we link code at .intrpt1 */ + /* Nothing needed here since we link code at .intrpt */ } int unaligned_fixup = 1; @@ -40,10 +42,9 @@ static int __init setup_unaligned_fixup(char *str) * will still parse the instruction, then fire a SIGBUS with * the correct address from inside the single_step code. */ - long val; - if (strict_strtol(str, 0, &val) != 0) + if (kstrtoint(str, 0, &unaligned_fixup) != 0) return 0; - unaligned_fixup = val; + pr_info("Fixups for unaligned data accesses are %s\n", unaligned_fixup >= 0 ? (unaligned_fixup ? "enabled" : "disabled") : @@ -99,13 +100,7 @@ static int retry_gpv(unsigned int gpv_reason) #endif /* CHIP_HAS_TILE_DMA() */ -#ifdef __tilegx__ -#define bundle_bits tilegx_bundle_bits -#else -#define bundle_bits tile_bundle_bits -#endif - -extern bundle_bits bpt_code; +extern tile_bundle_bits bpt_code; asm(".pushsection .rodata.bpt_code,\"a\";" ".align 8;" @@ -113,7 +108,7 @@ asm(".pushsection .rodata.bpt_code,\"a\";" ".size bpt_code,.-bpt_code;" ".popsection"); -static int special_ill(bundle_bits bundle, int *sigp, int *codep) +static int special_ill(tile_bundle_bits bundle, int *sigp, int *codep) { int sig, code, maxcode; @@ -194,34 +189,119 @@ static int special_ill(bundle_bits bundle, int *sigp, int *codep) return 1; } +static const char *const int_name[] = { + [INT_MEM_ERROR] = "Memory error", + [INT_ILL] = "Illegal instruction", + [INT_GPV] = "General protection violation", + [INT_UDN_ACCESS] = "UDN access", + [INT_IDN_ACCESS] = "IDN access", +#if CHIP_HAS_SN() + [INT_SN_ACCESS] = "SN access", +#endif + [INT_SWINT_3] = "Software interrupt 3", + [INT_SWINT_2] = "Software interrupt 2", + [INT_SWINT_0] = "Software interrupt 0", + [INT_UNALIGN_DATA] = "Unaligned data", + [INT_DOUBLE_FAULT] = "Double fault", +#ifdef __tilegx__ + [INT_ILL_TRANS] = "Illegal virtual address", +#endif +}; + +static int do_bpt(struct pt_regs *regs) +{ + unsigned long bundle, bcode, bpt; + + bundle = *(unsigned long *)instruction_pointer(regs); + + /* + * bpt shoule be { bpt; nop }, which is 0x286a44ae51485000ULL. + * we encode the unused least significant bits for other purpose. + */ + bpt = bundle & ~((1ULL << 12) - 1); + if (bpt != TILE_BPT_BUNDLE) + return 0; + + bcode = bundle & ((1ULL << 12) - 1); + /* + * notify the kprobe handlers, if instruction is likely to + * pertain to them. + */ + switch (bcode) { + /* breakpoint_insn */ + case 0: + notify_die(DIE_BREAK, "debug", regs, bundle, + INT_ILL, SIGTRAP); + break; + /* compiled_bpt */ + case DIE_COMPILED_BPT: + notify_die(DIE_COMPILED_BPT, "debug", regs, bundle, + INT_ILL, SIGTRAP); + break; + /* breakpoint2_insn */ + case DIE_SSTEPBP: + notify_die(DIE_SSTEPBP, "single_step", regs, bundle, + INT_ILL, SIGTRAP); + break; + default: + return 0; + } + + return 1; +} + void __kprobes do_trap(struct pt_regs *regs, int fault_num, unsigned long reason) { siginfo_t info = { 0 }; int signo, code; - unsigned long address; - bundle_bits instr; + unsigned long address = 0; + tile_bundle_bits instr; + int is_kernel = !user_mode(regs); + + /* Handle breakpoints, etc. */ + if (is_kernel && fault_num == INT_ILL && do_bpt(regs)) + return; - /* Re-enable interrupts. */ - local_irq_enable(); + /* Re-enable interrupts, if they were previously enabled. */ + if (!(regs->flags & PT_FLAGS_DISABLE_IRQ)) + local_irq_enable(); /* * If it hits in kernel mode and we can't fix it up, just exit the * current process and hope for the best. */ - if (!user_mode(regs)) { - if (fixup_exception(regs)) /* only UNALIGN_DATA in practice */ + if (is_kernel) { + const char *name; + char buf[100]; + if (fixup_exception(regs)) /* ILL_TRANS or UNALIGN_DATA */ return; - pr_alert("Kernel took bad trap %d at PC %#lx\n", - fault_num, regs->pc); + if (fault_num >= 0 && + fault_num < sizeof(int_name)/sizeof(int_name[0]) && + int_name[fault_num] != NULL) + name = int_name[fault_num]; + else + name = "Unknown interrupt"; if (fault_num == INT_GPV) - pr_alert("GPV_REASON is %#lx\n", reason); + snprintf(buf, sizeof(buf), "; GPV_REASON %#lx", reason); +#ifdef __tilegx__ + else if (fault_num == INT_ILL_TRANS) + snprintf(buf, sizeof(buf), "; address %#lx", reason); +#endif + else + buf[0] = '\0'; + pr_alert("Kernel took bad trap %d (%s) at PC %#lx%s\n", + fault_num, name, regs->pc, buf); show_regs(regs); do_exit(SIGKILL); /* FIXME: implement i386 die() */ return; } switch (fault_num) { + case INT_MEM_ERROR: + signo = SIGBUS; + code = BUS_OBJERR; + break; case INT_ILL: if (copy_from_user(&instr, (void __user *)regs->pc, sizeof(instr))) { @@ -288,14 +368,15 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num, address = regs->pc; break; #ifdef __tilegx__ - case INT_ILL_TRANS: + case INT_ILL_TRANS: { + /* Avoid a hardware erratum with the return address stack. */ + fill_ra_stack(); + signo = SIGSEGV; + address = reason; code = SEGV_MAPERR; - if (reason & SPR_ILL_TRANS_REASON__I_STREAM_VA_RMASK) - address = regs->pc; - else - address = 0; /* FIXME: GX: single-step for address */ break; + } #endif default: panic("Unexpected do_trap interrupt number %d", fault_num); @@ -307,7 +388,8 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num, info.si_addr = (void __user *)address; if (signo == SIGILL) info.si_trapno = fault_num; - trace_unhandled_signal("trap", regs, address, signo); + if (signo != SIGTRAP) + trace_unhandled_signal("trap", regs, address, signo); force_sig_info(signo, &info, current); } diff --git a/arch/tile/kernel/unaligned.c b/arch/tile/kernel/unaligned.c new file mode 100644 index 00000000000..c02ea2a45f6 --- /dev/null +++ b/arch/tile/kernel/unaligned.c @@ -0,0 +1,1598 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + * + * A code-rewriter that handles unaligned exception. + */ + +#include <linux/smp.h> +#include <linux/ptrace.h> +#include <linux/slab.h> +#include <linux/thread_info.h> +#include <linux/uaccess.h> +#include <linux/mman.h> +#include <linux/types.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/compat.h> +#include <linux/prctl.h> +#include <asm/cacheflush.h> +#include <asm/traps.h> +#include <asm/uaccess.h> +#include <asm/unaligned.h> +#include <arch/abi.h> +#include <arch/spr_def.h> +#include <arch/opcode.h> + + +/* + * This file handles unaligned exception for tile-Gx. The tilepro's unaligned + * exception is supported out of single_step.c + */ + +int unaligned_printk; + +static int __init setup_unaligned_printk(char *str) +{ + long val; + if (kstrtol(str, 0, &val) != 0) + return 0; + unaligned_printk = val; + pr_info("Printk for each unaligned data accesses is %s\n", + unaligned_printk ? "enabled" : "disabled"); + return 1; +} +__setup("unaligned_printk=", setup_unaligned_printk); + +unsigned int unaligned_fixup_count; + +#ifdef __tilegx__ + +/* + * Unalign data jit fixup code fragement. Reserved space is 128 bytes. + * The 1st 64-bit word saves fault PC address, 2nd word is the fault + * instruction bundle followed by 14 JIT bundles. + */ + +struct unaligned_jit_fragment { + unsigned long pc; + tilegx_bundle_bits bundle; + tilegx_bundle_bits insn[14]; +}; + +/* + * Check if a nop or fnop at bundle's pipeline X0. + */ + +static bool is_bundle_x0_nop(tilegx_bundle_bits bundle) +{ + return (((get_UnaryOpcodeExtension_X0(bundle) == + NOP_UNARY_OPCODE_X0) && + (get_RRROpcodeExtension_X0(bundle) == + UNARY_RRR_0_OPCODE_X0) && + (get_Opcode_X0(bundle) == + RRR_0_OPCODE_X0)) || + ((get_UnaryOpcodeExtension_X0(bundle) == + FNOP_UNARY_OPCODE_X0) && + (get_RRROpcodeExtension_X0(bundle) == + UNARY_RRR_0_OPCODE_X0) && + (get_Opcode_X0(bundle) == + RRR_0_OPCODE_X0))); +} + +/* + * Check if nop or fnop at bundle's pipeline X1. + */ + +static bool is_bundle_x1_nop(tilegx_bundle_bits bundle) +{ + return (((get_UnaryOpcodeExtension_X1(bundle) == + NOP_UNARY_OPCODE_X1) && + (get_RRROpcodeExtension_X1(bundle) == + UNARY_RRR_0_OPCODE_X1) && + (get_Opcode_X1(bundle) == + RRR_0_OPCODE_X1)) || + ((get_UnaryOpcodeExtension_X1(bundle) == + FNOP_UNARY_OPCODE_X1) && + (get_RRROpcodeExtension_X1(bundle) == + UNARY_RRR_0_OPCODE_X1) && + (get_Opcode_X1(bundle) == + RRR_0_OPCODE_X1))); +} + +/* + * Check if nop or fnop at bundle's Y0 pipeline. + */ + +static bool is_bundle_y0_nop(tilegx_bundle_bits bundle) +{ + return (((get_UnaryOpcodeExtension_Y0(bundle) == + NOP_UNARY_OPCODE_Y0) && + (get_RRROpcodeExtension_Y0(bundle) == + UNARY_RRR_1_OPCODE_Y0) && + (get_Opcode_Y0(bundle) == + RRR_1_OPCODE_Y0)) || + ((get_UnaryOpcodeExtension_Y0(bundle) == + FNOP_UNARY_OPCODE_Y0) && + (get_RRROpcodeExtension_Y0(bundle) == + UNARY_RRR_1_OPCODE_Y0) && + (get_Opcode_Y0(bundle) == + RRR_1_OPCODE_Y0))); +} + +/* + * Check if nop or fnop at bundle's pipeline Y1. + */ + +static bool is_bundle_y1_nop(tilegx_bundle_bits bundle) +{ + return (((get_UnaryOpcodeExtension_Y1(bundle) == + NOP_UNARY_OPCODE_Y1) && + (get_RRROpcodeExtension_Y1(bundle) == + UNARY_RRR_1_OPCODE_Y1) && + (get_Opcode_Y1(bundle) == + RRR_1_OPCODE_Y1)) || + ((get_UnaryOpcodeExtension_Y1(bundle) == + FNOP_UNARY_OPCODE_Y1) && + (get_RRROpcodeExtension_Y1(bundle) == + UNARY_RRR_1_OPCODE_Y1) && + (get_Opcode_Y1(bundle) == + RRR_1_OPCODE_Y1))); +} + +/* + * Test if a bundle's y0 and y1 pipelines are both nop or fnop. + */ + +static bool is_y0_y1_nop(tilegx_bundle_bits bundle) +{ + return is_bundle_y0_nop(bundle) && is_bundle_y1_nop(bundle); +} + +/* + * Test if a bundle's x0 and x1 pipelines are both nop or fnop. + */ + +static bool is_x0_x1_nop(tilegx_bundle_bits bundle) +{ + return is_bundle_x0_nop(bundle) && is_bundle_x1_nop(bundle); +} + +/* + * Find the destination, source registers of fault unalign access instruction + * at X1 or Y2. Also, allocate up to 3 scratch registers clob1, clob2 and + * clob3, which are guaranteed different from any register used in the fault + * bundle. r_alias is used to return if the other instructions other than the + * unalign load/store shares same register with ra, rb and rd. + */ + +static void find_regs(tilegx_bundle_bits bundle, uint64_t *rd, uint64_t *ra, + uint64_t *rb, uint64_t *clob1, uint64_t *clob2, + uint64_t *clob3, bool *r_alias) +{ + int i; + uint64_t reg; + uint64_t reg_map = 0, alias_reg_map = 0, map; + bool alias = false; + + /* + * Parse fault bundle, find potential used registers and mark + * corresponding bits in reg_map and alias_map. These 2 bit maps + * are used to find the scratch registers and determine if there + * is register alais. + */ + if (bundle & TILEGX_BUNDLE_MODE_MASK) { /* Y Mode Bundle. */ + + reg = get_SrcA_Y2(bundle); + reg_map |= 1ULL << reg; + *ra = reg; + reg = get_SrcBDest_Y2(bundle); + reg_map |= 1ULL << reg; + + if (rd) { + /* Load. */ + *rd = reg; + alias_reg_map = (1ULL << *rd) | (1ULL << *ra); + } else { + /* Store. */ + *rb = reg; + alias_reg_map = (1ULL << *ra) | (1ULL << *rb); + } + + if (!is_bundle_y1_nop(bundle)) { + reg = get_SrcA_Y1(bundle); + reg_map |= (1ULL << reg); + map = (1ULL << reg); + + reg = get_SrcB_Y1(bundle); + reg_map |= (1ULL << reg); + map |= (1ULL << reg); + + reg = get_Dest_Y1(bundle); + reg_map |= (1ULL << reg); + map |= (1ULL << reg); + + if (map & alias_reg_map) + alias = true; + } + + if (!is_bundle_y0_nop(bundle)) { + reg = get_SrcA_Y0(bundle); + reg_map |= (1ULL << reg); + map = (1ULL << reg); + + reg = get_SrcB_Y0(bundle); + reg_map |= (1ULL << reg); + map |= (1ULL << reg); + + reg = get_Dest_Y0(bundle); + reg_map |= (1ULL << reg); + map |= (1ULL << reg); + + if (map & alias_reg_map) + alias = true; + } + } else { /* X Mode Bundle. */ + + reg = get_SrcA_X1(bundle); + reg_map |= (1ULL << reg); + *ra = reg; + if (rd) { + /* Load. */ + reg = get_Dest_X1(bundle); + reg_map |= (1ULL << reg); + *rd = reg; + alias_reg_map = (1ULL << *rd) | (1ULL << *ra); + } else { + /* Store. */ + reg = get_SrcB_X1(bundle); + reg_map |= (1ULL << reg); + *rb = reg; + alias_reg_map = (1ULL << *ra) | (1ULL << *rb); + } + + if (!is_bundle_x0_nop(bundle)) { + reg = get_SrcA_X0(bundle); + reg_map |= (1ULL << reg); + map = (1ULL << reg); + + reg = get_SrcB_X0(bundle); + reg_map |= (1ULL << reg); + map |= (1ULL << reg); + + reg = get_Dest_X0(bundle); + reg_map |= (1ULL << reg); + map |= (1ULL << reg); + + if (map & alias_reg_map) + alias = true; + } + } + + /* + * "alias" indicates if the unalign access registers have collision + * with others in the same bundle. We jsut simply test all register + * operands case (RRR), ignored the case with immidate. If a bundle + * has no register alias, we may do fixup in a simple or fast manner. + * So if an immidata field happens to hit with a register, we may end + * up fall back to the generic handling. + */ + + *r_alias = alias; + + /* Flip bits on reg_map. */ + reg_map ^= -1ULL; + + /* Scan reg_map lower 54(TREG_SP) bits to find 3 set bits. */ + for (i = 0; i < TREG_SP; i++) { + if (reg_map & (0x1ULL << i)) { + if (*clob1 == -1) { + *clob1 = i; + } else if (*clob2 == -1) { + *clob2 = i; + } else if (*clob3 == -1) { + *clob3 = i; + return; + } + } + } +} + +/* + * Sanity check for register ra, rb, rd, clob1/2/3. Return true if any of them + * is unexpected. + */ + +static bool check_regs(uint64_t rd, uint64_t ra, uint64_t rb, + uint64_t clob1, uint64_t clob2, uint64_t clob3) +{ + bool unexpected = false; + if ((ra >= 56) && (ra != TREG_ZERO)) + unexpected = true; + + if ((clob1 >= 56) || (clob2 >= 56) || (clob3 >= 56)) + unexpected = true; + + if (rd != -1) { + if ((rd >= 56) && (rd != TREG_ZERO)) + unexpected = true; + } else { + if ((rb >= 56) && (rb != TREG_ZERO)) + unexpected = true; + } + return unexpected; +} + + +#define GX_INSN_X0_MASK ((1ULL << 31) - 1) +#define GX_INSN_X1_MASK (((1ULL << 31) - 1) << 31) +#define GX_INSN_Y0_MASK ((0xFULL << 27) | (0xFFFFFULL)) +#define GX_INSN_Y1_MASK (GX_INSN_Y0_MASK << 31) +#define GX_INSN_Y2_MASK ((0x7FULL << 51) | (0x7FULL << 20)) + +#ifdef __LITTLE_ENDIAN +#define GX_INSN_BSWAP(_bundle_) (_bundle_) +#else +#define GX_INSN_BSWAP(_bundle_) swab64(_bundle_) +#endif /* __LITTLE_ENDIAN */ + +/* + * __JIT_CODE(.) creates template bundles in .rodata.unalign_data section. + * The corresponding static function jix_x#_###(.) generates partial or + * whole bundle based on the template and given arguments. + */ + +#define __JIT_CODE(_X_) \ + asm (".pushsection .rodata.unalign_data, \"a\"\n" \ + _X_"\n" \ + ".popsection\n") + +__JIT_CODE("__unalign_jit_x1_mtspr: {mtspr 0, r0}"); +static tilegx_bundle_bits jit_x1_mtspr(int spr, int reg) +{ + extern tilegx_bundle_bits __unalign_jit_x1_mtspr; + return (GX_INSN_BSWAP(__unalign_jit_x1_mtspr) & GX_INSN_X1_MASK) | + create_MT_Imm14_X1(spr) | create_SrcA_X1(reg); +} + +__JIT_CODE("__unalign_jit_x1_mfspr: {mfspr r0, 0}"); +static tilegx_bundle_bits jit_x1_mfspr(int reg, int spr) +{ + extern tilegx_bundle_bits __unalign_jit_x1_mfspr; + return (GX_INSN_BSWAP(__unalign_jit_x1_mfspr) & GX_INSN_X1_MASK) | + create_MF_Imm14_X1(spr) | create_Dest_X1(reg); +} + +__JIT_CODE("__unalign_jit_x0_addi: {addi r0, r0, 0; iret}"); +static tilegx_bundle_bits jit_x0_addi(int rd, int ra, int imm8) +{ + extern tilegx_bundle_bits __unalign_jit_x0_addi; + return (GX_INSN_BSWAP(__unalign_jit_x0_addi) & GX_INSN_X0_MASK) | + create_Dest_X0(rd) | create_SrcA_X0(ra) | + create_Imm8_X0(imm8); +} + +__JIT_CODE("__unalign_jit_x1_ldna: {ldna r0, r0}"); +static tilegx_bundle_bits jit_x1_ldna(int rd, int ra) +{ + extern tilegx_bundle_bits __unalign_jit_x1_ldna; + return (GX_INSN_BSWAP(__unalign_jit_x1_ldna) & GX_INSN_X1_MASK) | + create_Dest_X1(rd) | create_SrcA_X1(ra); +} + +__JIT_CODE("__unalign_jit_x0_dblalign: {dblalign r0, r0 ,r0}"); +static tilegx_bundle_bits jit_x0_dblalign(int rd, int ra, int rb) +{ + extern tilegx_bundle_bits __unalign_jit_x0_dblalign; + return (GX_INSN_BSWAP(__unalign_jit_x0_dblalign) & GX_INSN_X0_MASK) | + create_Dest_X0(rd) | create_SrcA_X0(ra) | + create_SrcB_X0(rb); +} + +__JIT_CODE("__unalign_jit_x1_iret: {iret}"); +static tilegx_bundle_bits jit_x1_iret(void) +{ + extern tilegx_bundle_bits __unalign_jit_x1_iret; + return GX_INSN_BSWAP(__unalign_jit_x1_iret) & GX_INSN_X1_MASK; +} + +__JIT_CODE("__unalign_jit_x01_fnop: {fnop;fnop}"); +static tilegx_bundle_bits jit_x0_fnop(void) +{ + extern tilegx_bundle_bits __unalign_jit_x01_fnop; + return GX_INSN_BSWAP(__unalign_jit_x01_fnop) & GX_INSN_X0_MASK; +} + +static tilegx_bundle_bits jit_x1_fnop(void) +{ + extern tilegx_bundle_bits __unalign_jit_x01_fnop; + return GX_INSN_BSWAP(__unalign_jit_x01_fnop) & GX_INSN_X1_MASK; +} + +__JIT_CODE("__unalign_jit_y2_dummy: {fnop; fnop; ld zero, sp}"); +static tilegx_bundle_bits jit_y2_dummy(void) +{ + extern tilegx_bundle_bits __unalign_jit_y2_dummy; + return GX_INSN_BSWAP(__unalign_jit_y2_dummy) & GX_INSN_Y2_MASK; +} + +static tilegx_bundle_bits jit_y1_fnop(void) +{ + extern tilegx_bundle_bits __unalign_jit_y2_dummy; + return GX_INSN_BSWAP(__unalign_jit_y2_dummy) & GX_INSN_Y1_MASK; +} + +__JIT_CODE("__unalign_jit_x1_st1_add: {st1_add r1, r0, 0}"); +static tilegx_bundle_bits jit_x1_st1_add(int ra, int rb, int imm8) +{ + extern tilegx_bundle_bits __unalign_jit_x1_st1_add; + return (GX_INSN_BSWAP(__unalign_jit_x1_st1_add) & + (~create_SrcA_X1(-1)) & + GX_INSN_X1_MASK) | create_SrcA_X1(ra) | + create_SrcB_X1(rb) | create_Dest_Imm8_X1(imm8); +} + +__JIT_CODE("__unalign_jit_x1_st: {crc32_8 r1, r0, r0; st r0, r0}"); +static tilegx_bundle_bits jit_x1_st(int ra, int rb) +{ + extern tilegx_bundle_bits __unalign_jit_x1_st; + return (GX_INSN_BSWAP(__unalign_jit_x1_st) & GX_INSN_X1_MASK) | + create_SrcA_X1(ra) | create_SrcB_X1(rb); +} + +__JIT_CODE("__unalign_jit_x1_st_add: {st_add r1, r0, 0}"); +static tilegx_bundle_bits jit_x1_st_add(int ra, int rb, int imm8) +{ + extern tilegx_bundle_bits __unalign_jit_x1_st_add; + return (GX_INSN_BSWAP(__unalign_jit_x1_st_add) & + (~create_SrcA_X1(-1)) & + GX_INSN_X1_MASK) | create_SrcA_X1(ra) | + create_SrcB_X1(rb) | create_Dest_Imm8_X1(imm8); +} + +__JIT_CODE("__unalign_jit_x1_ld: {crc32_8 r1, r0, r0; ld r0, r0}"); +static tilegx_bundle_bits jit_x1_ld(int rd, int ra) +{ + extern tilegx_bundle_bits __unalign_jit_x1_ld; + return (GX_INSN_BSWAP(__unalign_jit_x1_ld) & GX_INSN_X1_MASK) | + create_Dest_X1(rd) | create_SrcA_X1(ra); +} + +__JIT_CODE("__unalign_jit_x1_ld_add: {ld_add r1, r0, 0}"); +static tilegx_bundle_bits jit_x1_ld_add(int rd, int ra, int imm8) +{ + extern tilegx_bundle_bits __unalign_jit_x1_ld_add; + return (GX_INSN_BSWAP(__unalign_jit_x1_ld_add) & + (~create_Dest_X1(-1)) & + GX_INSN_X1_MASK) | create_Dest_X1(rd) | + create_SrcA_X1(ra) | create_Imm8_X1(imm8); +} + +__JIT_CODE("__unalign_jit_x0_bfexts: {bfexts r0, r0, 0, 0}"); +static tilegx_bundle_bits jit_x0_bfexts(int rd, int ra, int bfs, int bfe) +{ + extern tilegx_bundle_bits __unalign_jit_x0_bfexts; + return (GX_INSN_BSWAP(__unalign_jit_x0_bfexts) & + GX_INSN_X0_MASK) | + create_Dest_X0(rd) | create_SrcA_X0(ra) | + create_BFStart_X0(bfs) | create_BFEnd_X0(bfe); +} + +__JIT_CODE("__unalign_jit_x0_bfextu: {bfextu r0, r0, 0, 0}"); +static tilegx_bundle_bits jit_x0_bfextu(int rd, int ra, int bfs, int bfe) +{ + extern tilegx_bundle_bits __unalign_jit_x0_bfextu; + return (GX_INSN_BSWAP(__unalign_jit_x0_bfextu) & + GX_INSN_X0_MASK) | + create_Dest_X0(rd) | create_SrcA_X0(ra) | + create_BFStart_X0(bfs) | create_BFEnd_X0(bfe); +} + +__JIT_CODE("__unalign_jit_x1_addi: {bfextu r1, r1, 0, 0; addi r0, r0, 0}"); +static tilegx_bundle_bits jit_x1_addi(int rd, int ra, int imm8) +{ + extern tilegx_bundle_bits __unalign_jit_x1_addi; + return (GX_INSN_BSWAP(__unalign_jit_x1_addi) & GX_INSN_X1_MASK) | + create_Dest_X1(rd) | create_SrcA_X1(ra) | + create_Imm8_X1(imm8); +} + +__JIT_CODE("__unalign_jit_x0_shrui: {shrui r0, r0, 0; iret}"); +static tilegx_bundle_bits jit_x0_shrui(int rd, int ra, int imm6) +{ + extern tilegx_bundle_bits __unalign_jit_x0_shrui; + return (GX_INSN_BSWAP(__unalign_jit_x0_shrui) & + GX_INSN_X0_MASK) | + create_Dest_X0(rd) | create_SrcA_X0(ra) | + create_ShAmt_X0(imm6); +} + +__JIT_CODE("__unalign_jit_x0_rotli: {rotli r0, r0, 0; iret}"); +static tilegx_bundle_bits jit_x0_rotli(int rd, int ra, int imm6) +{ + extern tilegx_bundle_bits __unalign_jit_x0_rotli; + return (GX_INSN_BSWAP(__unalign_jit_x0_rotli) & + GX_INSN_X0_MASK) | + create_Dest_X0(rd) | create_SrcA_X0(ra) | + create_ShAmt_X0(imm6); +} + +__JIT_CODE("__unalign_jit_x1_bnezt: {bnezt r0, __unalign_jit_x1_bnezt}"); +static tilegx_bundle_bits jit_x1_bnezt(int ra, int broff) +{ + extern tilegx_bundle_bits __unalign_jit_x1_bnezt; + return (GX_INSN_BSWAP(__unalign_jit_x1_bnezt) & + GX_INSN_X1_MASK) | + create_SrcA_X1(ra) | create_BrOff_X1(broff); +} + +#undef __JIT_CODE + +/* + * This function generates unalign fixup JIT. + * + * We first find unalign load/store instruction's destination, source + * registers: ra, rb and rd. and 3 scratch registers by calling + * find_regs(...). 3 scratch clobbers should not alias with any register + * used in the fault bundle. Then analyze the fault bundle to determine + * if it's a load or store, operand width, branch or address increment etc. + * At last generated JIT is copied into JIT code area in user space. + */ + +static +void jit_bundle_gen(struct pt_regs *regs, tilegx_bundle_bits bundle, + int align_ctl) +{ + struct thread_info *info = current_thread_info(); + struct unaligned_jit_fragment frag; + struct unaligned_jit_fragment *jit_code_area; + tilegx_bundle_bits bundle_2 = 0; + /* If bundle_2_enable = false, bundle_2 is fnop/nop operation. */ + bool bundle_2_enable = true; + uint64_t ra = -1, rb = -1, rd = -1, clob1 = -1, clob2 = -1, clob3 = -1; + /* + * Indicate if the unalign access + * instruction's registers hit with + * others in the same bundle. + */ + bool alias = false; + bool load_n_store = true; + bool load_store_signed = false; + unsigned int load_store_size = 8; + bool y1_br = false; /* True, for a branch in same bundle at Y1.*/ + int y1_br_reg = 0; + /* True for link operation. i.e. jalr or lnk at Y1 */ + bool y1_lr = false; + int y1_lr_reg = 0; + bool x1_add = false;/* True, for load/store ADD instruction at X1*/ + int x1_add_imm8 = 0; + bool unexpected = false; + int n = 0, k; + + jit_code_area = + (struct unaligned_jit_fragment *)(info->unalign_jit_base); + + memset((void *)&frag, 0, sizeof(frag)); + + /* 0: X mode, Otherwise: Y mode. */ + if (bundle & TILEGX_BUNDLE_MODE_MASK) { + unsigned int mod, opcode; + + if (get_Opcode_Y1(bundle) == RRR_1_OPCODE_Y1 && + get_RRROpcodeExtension_Y1(bundle) == + UNARY_RRR_1_OPCODE_Y1) { + + opcode = get_UnaryOpcodeExtension_Y1(bundle); + + /* + * Test "jalr", "jalrp", "jr", "jrp" instruction at Y1 + * pipeline. + */ + switch (opcode) { + case JALR_UNARY_OPCODE_Y1: + case JALRP_UNARY_OPCODE_Y1: + y1_lr = true; + y1_lr_reg = 55; /* Link register. */ + /* FALLTHROUGH */ + case JR_UNARY_OPCODE_Y1: + case JRP_UNARY_OPCODE_Y1: + y1_br = true; + y1_br_reg = get_SrcA_Y1(bundle); + break; + case LNK_UNARY_OPCODE_Y1: + /* "lnk" at Y1 pipeline. */ + y1_lr = true; + y1_lr_reg = get_Dest_Y1(bundle); + break; + } + } + + opcode = get_Opcode_Y2(bundle); + mod = get_Mode(bundle); + + /* + * bundle_2 is bundle after making Y2 as a dummy operation + * - ld zero, sp + */ + bundle_2 = (bundle & (~GX_INSN_Y2_MASK)) | jit_y2_dummy(); + + /* Make Y1 as fnop if Y1 is a branch or lnk operation. */ + if (y1_br || y1_lr) { + bundle_2 &= ~(GX_INSN_Y1_MASK); + bundle_2 |= jit_y1_fnop(); + } + + if (is_y0_y1_nop(bundle_2)) + bundle_2_enable = false; + + if (mod == MODE_OPCODE_YC2) { + /* Store. */ + load_n_store = false; + load_store_size = 1 << opcode; + load_store_signed = false; + find_regs(bundle, 0, &ra, &rb, &clob1, &clob2, + &clob3, &alias); + if (load_store_size > 8) + unexpected = true; + } else { + /* Load. */ + load_n_store = true; + if (mod == MODE_OPCODE_YB2) { + switch (opcode) { + case LD_OPCODE_Y2: + load_store_signed = false; + load_store_size = 8; + break; + case LD4S_OPCODE_Y2: + load_store_signed = true; + load_store_size = 4; + break; + case LD4U_OPCODE_Y2: + load_store_signed = false; + load_store_size = 4; + break; + default: + unexpected = true; + } + } else if (mod == MODE_OPCODE_YA2) { + if (opcode == LD2S_OPCODE_Y2) { + load_store_signed = true; + load_store_size = 2; + } else if (opcode == LD2U_OPCODE_Y2) { + load_store_signed = false; + load_store_size = 2; + } else + unexpected = true; + } else + unexpected = true; + find_regs(bundle, &rd, &ra, &rb, &clob1, &clob2, + &clob3, &alias); + } + } else { + unsigned int opcode; + + /* bundle_2 is bundle after making X1 as "fnop". */ + bundle_2 = (bundle & (~GX_INSN_X1_MASK)) | jit_x1_fnop(); + + if (is_x0_x1_nop(bundle_2)) + bundle_2_enable = false; + + if (get_Opcode_X1(bundle) == RRR_0_OPCODE_X1) { + opcode = get_UnaryOpcodeExtension_X1(bundle); + + if (get_RRROpcodeExtension_X1(bundle) == + UNARY_RRR_0_OPCODE_X1) { + load_n_store = true; + find_regs(bundle, &rd, &ra, &rb, &clob1, + &clob2, &clob3, &alias); + + switch (opcode) { + case LD_UNARY_OPCODE_X1: + load_store_signed = false; + load_store_size = 8; + break; + case LD4S_UNARY_OPCODE_X1: + load_store_signed = true; + /* FALLTHROUGH */ + case LD4U_UNARY_OPCODE_X1: + load_store_size = 4; + break; + + case LD2S_UNARY_OPCODE_X1: + load_store_signed = true; + /* FALLTHROUGH */ + case LD2U_UNARY_OPCODE_X1: + load_store_size = 2; + break; + default: + unexpected = true; + } + } else { + load_n_store = false; + load_store_signed = false; + find_regs(bundle, 0, &ra, &rb, + &clob1, &clob2, &clob3, + &alias); + + opcode = get_RRROpcodeExtension_X1(bundle); + switch (opcode) { + case ST_RRR_0_OPCODE_X1: + load_store_size = 8; + break; + case ST4_RRR_0_OPCODE_X1: + load_store_size = 4; + break; + case ST2_RRR_0_OPCODE_X1: + load_store_size = 2; + break; + default: + unexpected = true; + } + } + } else if (get_Opcode_X1(bundle) == IMM8_OPCODE_X1) { + load_n_store = true; + opcode = get_Imm8OpcodeExtension_X1(bundle); + switch (opcode) { + case LD_ADD_IMM8_OPCODE_X1: + load_store_size = 8; + break; + + case LD4S_ADD_IMM8_OPCODE_X1: + load_store_signed = true; + /* FALLTHROUGH */ + case LD4U_ADD_IMM8_OPCODE_X1: + load_store_size = 4; + break; + + case LD2S_ADD_IMM8_OPCODE_X1: + load_store_signed = true; + /* FALLTHROUGH */ + case LD2U_ADD_IMM8_OPCODE_X1: + load_store_size = 2; + break; + + case ST_ADD_IMM8_OPCODE_X1: + load_n_store = false; + load_store_size = 8; + break; + case ST4_ADD_IMM8_OPCODE_X1: + load_n_store = false; + load_store_size = 4; + break; + case ST2_ADD_IMM8_OPCODE_X1: + load_n_store = false; + load_store_size = 2; + break; + default: + unexpected = true; + } + + if (!unexpected) { + x1_add = true; + if (load_n_store) + x1_add_imm8 = get_Imm8_X1(bundle); + else + x1_add_imm8 = get_Dest_Imm8_X1(bundle); + } + + find_regs(bundle, load_n_store ? (&rd) : NULL, + &ra, &rb, &clob1, &clob2, &clob3, &alias); + } else + unexpected = true; + } + + /* + * Some sanity check for register numbers extracted from fault bundle. + */ + if (check_regs(rd, ra, rb, clob1, clob2, clob3) == true) + unexpected = true; + + /* Give warning if register ra has an aligned address. */ + if (!unexpected) + WARN_ON(!((load_store_size - 1) & (regs->regs[ra]))); + + + /* + * Fault came from kernel space, here we only need take care of + * unaligned "get_user/put_user" macros defined in "uaccess.h". + * Basically, we will handle bundle like this: + * {ld/2u/4s rd, ra; movei rx, 0} or {st/2/4 ra, rb; movei rx, 0} + * (Refer to file "arch/tile/include/asm/uaccess.h" for details). + * For either load or store, byte-wise operation is performed by calling + * get_user() or put_user(). If the macro returns non-zero value, + * set the value to rx, otherwise set zero to rx. Finally make pc point + * to next bundle and return. + */ + + if (EX1_PL(regs->ex1) != USER_PL) { + + unsigned long rx = 0; + unsigned long x = 0, ret = 0; + + if (y1_br || y1_lr || x1_add || + (load_store_signed != + (load_n_store && load_store_size == 4))) { + /* No branch, link, wrong sign-ext or load/store add. */ + unexpected = true; + } else if (!unexpected) { + if (bundle & TILEGX_BUNDLE_MODE_MASK) { + /* + * Fault bundle is Y mode. + * Check if the Y1 and Y0 is the form of + * { movei rx, 0; nop/fnop }, if yes, + * find the rx. + */ + + if ((get_Opcode_Y1(bundle) == ADDI_OPCODE_Y1) + && (get_SrcA_Y1(bundle) == TREG_ZERO) && + (get_Imm8_Y1(bundle) == 0) && + is_bundle_y0_nop(bundle)) { + rx = get_Dest_Y1(bundle); + } else if ((get_Opcode_Y0(bundle) == + ADDI_OPCODE_Y0) && + (get_SrcA_Y0(bundle) == TREG_ZERO) && + (get_Imm8_Y0(bundle) == 0) && + is_bundle_y1_nop(bundle)) { + rx = get_Dest_Y0(bundle); + } else { + unexpected = true; + } + } else { + /* + * Fault bundle is X mode. + * Check if the X0 is 'movei rx, 0', + * if yes, find the rx. + */ + + if ((get_Opcode_X0(bundle) == IMM8_OPCODE_X0) + && (get_Imm8OpcodeExtension_X0(bundle) == + ADDI_IMM8_OPCODE_X0) && + (get_SrcA_X0(bundle) == TREG_ZERO) && + (get_Imm8_X0(bundle) == 0)) { + rx = get_Dest_X0(bundle); + } else { + unexpected = true; + } + } + + /* rx should be less than 56. */ + if (!unexpected && (rx >= 56)) + unexpected = true; + } + + if (!search_exception_tables(regs->pc)) { + /* No fixup in the exception tables for the pc. */ + unexpected = true; + } + + if (unexpected) { + /* Unexpected unalign kernel fault. */ + struct task_struct *tsk = validate_current(); + + bust_spinlocks(1); + + show_regs(regs); + + if (unlikely(tsk->pid < 2)) { + panic("Kernel unalign fault running %s!", + tsk->pid ? "init" : "the idle task"); + } +#ifdef SUPPORT_DIE + die("Oops", regs); +#endif + bust_spinlocks(1); + + do_group_exit(SIGKILL); + + } else { + unsigned long i, b = 0; + unsigned char *ptr = + (unsigned char *)regs->regs[ra]; + if (load_n_store) { + /* handle get_user(x, ptr) */ + for (i = 0; i < load_store_size; i++) { + ret = get_user(b, ptr++); + if (!ret) { + /* Success! update x. */ +#ifdef __LITTLE_ENDIAN + x |= (b << (8 * i)); +#else + x <<= 8; + x |= b; +#endif /* __LITTLE_ENDIAN */ + } else { + x = 0; + break; + } + } + + /* Sign-extend 4-byte loads. */ + if (load_store_size == 4) + x = (long)(int)x; + + /* Set register rd. */ + regs->regs[rd] = x; + + /* Set register rx. */ + regs->regs[rx] = ret; + + /* Bump pc. */ + regs->pc += 8; + + } else { + /* Handle put_user(x, ptr) */ + x = regs->regs[rb]; +#ifdef __LITTLE_ENDIAN + b = x; +#else + /* + * Swap x in order to store x from low + * to high memory same as the + * little-endian case. + */ + switch (load_store_size) { + case 8: + b = swab64(x); + break; + case 4: + b = swab32(x); + break; + case 2: + b = swab16(x); + break; + } +#endif /* __LITTLE_ENDIAN */ + for (i = 0; i < load_store_size; i++) { + ret = put_user(b, ptr++); + if (ret) + break; + /* Success! shift 1 byte. */ + b >>= 8; + } + /* Set register rx. */ + regs->regs[rx] = ret; + + /* Bump pc. */ + regs->pc += 8; + } + } + + unaligned_fixup_count++; + + if (unaligned_printk) { + pr_info("%s/%d. Unalign fixup for kernel access " + "to userspace %lx.", + current->comm, current->pid, regs->regs[ra]); + } + + /* Done! Return to the exception handler. */ + return; + } + + if ((align_ctl == 0) || unexpected) { + siginfo_t info = { + .si_signo = SIGBUS, + .si_code = BUS_ADRALN, + .si_addr = (unsigned char __user *)0 + }; + if (unaligned_printk) + pr_info("Unalign bundle: unexp @%llx, %llx", + (unsigned long long)regs->pc, + (unsigned long long)bundle); + + if (ra < 56) { + unsigned long uaa = (unsigned long)regs->regs[ra]; + /* Set bus Address. */ + info.si_addr = (unsigned char __user *)uaa; + } + + unaligned_fixup_count++; + + trace_unhandled_signal("unaligned fixup trap", regs, + (unsigned long)info.si_addr, SIGBUS); + force_sig_info(info.si_signo, &info, current); + return; + } + +#ifdef __LITTLE_ENDIAN +#define UA_FIXUP_ADDR_DELTA 1 +#define UA_FIXUP_BFEXT_START(_B_) 0 +#define UA_FIXUP_BFEXT_END(_B_) (8 * (_B_) - 1) +#else /* __BIG_ENDIAN */ +#define UA_FIXUP_ADDR_DELTA -1 +#define UA_FIXUP_BFEXT_START(_B_) (64 - 8 * (_B_)) +#define UA_FIXUP_BFEXT_END(_B_) 63 +#endif /* __LITTLE_ENDIAN */ + + + + if ((ra != rb) && (rd != TREG_SP) && !alias && + !y1_br && !y1_lr && !x1_add) { + /* + * Simple case: ra != rb and no register alias found, + * and no branch or link. This will be the majority. + * We can do a little better for simplae case than the + * generic scheme below. + */ + if (!load_n_store) { + /* + * Simple store: ra != rb, no need for scratch register. + * Just store and rotate to right bytewise. + */ +#ifdef __BIG_ENDIAN + frag.insn[n++] = + jit_x0_addi(ra, ra, load_store_size - 1) | + jit_x1_fnop(); +#endif /* __BIG_ENDIAN */ + for (k = 0; k < load_store_size; k++) { + /* Store a byte. */ + frag.insn[n++] = + jit_x0_rotli(rb, rb, 56) | + jit_x1_st1_add(ra, rb, + UA_FIXUP_ADDR_DELTA); + } +#ifdef __BIG_ENDIAN + frag.insn[n] = jit_x1_addi(ra, ra, 1); +#else + frag.insn[n] = jit_x1_addi(ra, ra, + -1 * load_store_size); +#endif /* __LITTLE_ENDIAN */ + + if (load_store_size == 8) { + frag.insn[n] |= jit_x0_fnop(); + } else if (load_store_size == 4) { + frag.insn[n] |= jit_x0_rotli(rb, rb, 32); + } else { /* = 2 */ + frag.insn[n] |= jit_x0_rotli(rb, rb, 16); + } + n++; + if (bundle_2_enable) + frag.insn[n++] = bundle_2; + frag.insn[n++] = jit_x0_fnop() | jit_x1_iret(); + } else { + if (rd == ra) { + /* Use two clobber registers: clob1/2. */ + frag.insn[n++] = + jit_x0_addi(TREG_SP, TREG_SP, -16) | + jit_x1_fnop(); + frag.insn[n++] = + jit_x0_addi(clob1, ra, 7) | + jit_x1_st_add(TREG_SP, clob1, -8); + frag.insn[n++] = + jit_x0_addi(clob2, ra, 0) | + jit_x1_st(TREG_SP, clob2); + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_ldna(rd, ra); + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_ldna(clob1, clob1); + /* + * Note: we must make sure that rd must not + * be sp. Recover clob1/2 from stack. + */ + frag.insn[n++] = + jit_x0_dblalign(rd, clob1, clob2) | + jit_x1_ld_add(clob2, TREG_SP, 8); + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_ld_add(clob1, TREG_SP, 16); + } else { + /* Use one clobber register: clob1 only. */ + frag.insn[n++] = + jit_x0_addi(TREG_SP, TREG_SP, -16) | + jit_x1_fnop(); + frag.insn[n++] = + jit_x0_addi(clob1, ra, 7) | + jit_x1_st(TREG_SP, clob1); + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_ldna(rd, ra); + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_ldna(clob1, clob1); + /* + * Note: we must make sure that rd must not + * be sp. Recover clob1 from stack. + */ + frag.insn[n++] = + jit_x0_dblalign(rd, clob1, ra) | + jit_x1_ld_add(clob1, TREG_SP, 16); + } + + if (bundle_2_enable) + frag.insn[n++] = bundle_2; + /* + * For non 8-byte load, extract corresponding bytes and + * signed extension. + */ + if (load_store_size == 4) { + if (load_store_signed) + frag.insn[n++] = + jit_x0_bfexts( + rd, rd, + UA_FIXUP_BFEXT_START(4), + UA_FIXUP_BFEXT_END(4)) | + jit_x1_fnop(); + else + frag.insn[n++] = + jit_x0_bfextu( + rd, rd, + UA_FIXUP_BFEXT_START(4), + UA_FIXUP_BFEXT_END(4)) | + jit_x1_fnop(); + } else if (load_store_size == 2) { + if (load_store_signed) + frag.insn[n++] = + jit_x0_bfexts( + rd, rd, + UA_FIXUP_BFEXT_START(2), + UA_FIXUP_BFEXT_END(2)) | + jit_x1_fnop(); + else + frag.insn[n++] = + jit_x0_bfextu( + rd, rd, + UA_FIXUP_BFEXT_START(2), + UA_FIXUP_BFEXT_END(2)) | + jit_x1_fnop(); + } + + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_iret(); + } + } else if (!load_n_store) { + + /* + * Generic memory store cases: use 3 clobber registers. + * + * Alloc space for saveing clob2,1,3 on user's stack. + * register clob3 points to where clob2 saved, followed by + * clob1 and 3 from high to low memory. + */ + frag.insn[n++] = + jit_x0_addi(TREG_SP, TREG_SP, -32) | + jit_x1_fnop(); + frag.insn[n++] = + jit_x0_addi(clob3, TREG_SP, 16) | + jit_x1_st_add(TREG_SP, clob3, 8); +#ifdef __LITTLE_ENDIAN + frag.insn[n++] = + jit_x0_addi(clob1, ra, 0) | + jit_x1_st_add(TREG_SP, clob1, 8); +#else + frag.insn[n++] = + jit_x0_addi(clob1, ra, load_store_size - 1) | + jit_x1_st_add(TREG_SP, clob1, 8); +#endif + if (load_store_size == 8) { + /* + * We save one byte a time, not for fast, but compact + * code. After each store, data source register shift + * right one byte. unchanged after 8 stores. + */ + frag.insn[n++] = + jit_x0_addi(clob2, TREG_ZERO, 7) | + jit_x1_st_add(TREG_SP, clob2, 16); + frag.insn[n++] = + jit_x0_rotli(rb, rb, 56) | + jit_x1_st1_add(clob1, rb, UA_FIXUP_ADDR_DELTA); + frag.insn[n++] = + jit_x0_addi(clob2, clob2, -1) | + jit_x1_bnezt(clob2, -1); + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_addi(clob2, y1_br_reg, 0); + } else if (load_store_size == 4) { + frag.insn[n++] = + jit_x0_addi(clob2, TREG_ZERO, 3) | + jit_x1_st_add(TREG_SP, clob2, 16); + frag.insn[n++] = + jit_x0_rotli(rb, rb, 56) | + jit_x1_st1_add(clob1, rb, UA_FIXUP_ADDR_DELTA); + frag.insn[n++] = + jit_x0_addi(clob2, clob2, -1) | + jit_x1_bnezt(clob2, -1); + /* + * same as 8-byte case, but need shift another 4 + * byte to recover rb for 4-byte store. + */ + frag.insn[n++] = jit_x0_rotli(rb, rb, 32) | + jit_x1_addi(clob2, y1_br_reg, 0); + } else { /* =2 */ + frag.insn[n++] = + jit_x0_addi(clob2, rb, 0) | + jit_x1_st_add(TREG_SP, clob2, 16); + for (k = 0; k < 2; k++) { + frag.insn[n++] = + jit_x0_shrui(rb, rb, 8) | + jit_x1_st1_add(clob1, rb, + UA_FIXUP_ADDR_DELTA); + } + frag.insn[n++] = + jit_x0_addi(rb, clob2, 0) | + jit_x1_addi(clob2, y1_br_reg, 0); + } + + if (bundle_2_enable) + frag.insn[n++] = bundle_2; + + if (y1_lr) { + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_mfspr(y1_lr_reg, + SPR_EX_CONTEXT_0_0); + } + if (y1_br) { + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_mtspr(SPR_EX_CONTEXT_0_0, + clob2); + } + if (x1_add) { + frag.insn[n++] = + jit_x0_addi(ra, ra, x1_add_imm8) | + jit_x1_ld_add(clob2, clob3, -8); + } else { + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_ld_add(clob2, clob3, -8); + } + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_ld_add(clob1, clob3, -8); + frag.insn[n++] = jit_x0_fnop() | jit_x1_ld(clob3, clob3); + frag.insn[n++] = jit_x0_fnop() | jit_x1_iret(); + + } else { + /* + * Generic memory load cases. + * + * Alloc space for saveing clob1,2,3 on user's stack. + * register clob3 points to where clob1 saved, followed + * by clob2 and 3 from high to low memory. + */ + + frag.insn[n++] = + jit_x0_addi(TREG_SP, TREG_SP, -32) | + jit_x1_fnop(); + frag.insn[n++] = + jit_x0_addi(clob3, TREG_SP, 16) | + jit_x1_st_add(TREG_SP, clob3, 8); + frag.insn[n++] = + jit_x0_addi(clob2, ra, 0) | + jit_x1_st_add(TREG_SP, clob2, 8); + + if (y1_br) { + frag.insn[n++] = + jit_x0_addi(clob1, y1_br_reg, 0) | + jit_x1_st_add(TREG_SP, clob1, 16); + } else { + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_st_add(TREG_SP, clob1, 16); + } + + if (bundle_2_enable) + frag.insn[n++] = bundle_2; + + if (y1_lr) { + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_mfspr(y1_lr_reg, + SPR_EX_CONTEXT_0_0); + } + + if (y1_br) { + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_mtspr(SPR_EX_CONTEXT_0_0, + clob1); + } + + frag.insn[n++] = + jit_x0_addi(clob1, clob2, 7) | + jit_x1_ldna(rd, clob2); + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_ldna(clob1, clob1); + frag.insn[n++] = + jit_x0_dblalign(rd, clob1, clob2) | + jit_x1_ld_add(clob1, clob3, -8); + if (x1_add) { + frag.insn[n++] = + jit_x0_addi(ra, ra, x1_add_imm8) | + jit_x1_ld_add(clob2, clob3, -8); + } else { + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_ld_add(clob2, clob3, -8); + } + + frag.insn[n++] = + jit_x0_fnop() | + jit_x1_ld(clob3, clob3); + + if (load_store_size == 4) { + if (load_store_signed) + frag.insn[n++] = + jit_x0_bfexts( + rd, rd, + UA_FIXUP_BFEXT_START(4), + UA_FIXUP_BFEXT_END(4)) | + jit_x1_fnop(); + else + frag.insn[n++] = + jit_x0_bfextu( + rd, rd, + UA_FIXUP_BFEXT_START(4), + UA_FIXUP_BFEXT_END(4)) | + jit_x1_fnop(); + } else if (load_store_size == 2) { + if (load_store_signed) + frag.insn[n++] = + jit_x0_bfexts( + rd, rd, + UA_FIXUP_BFEXT_START(2), + UA_FIXUP_BFEXT_END(2)) | + jit_x1_fnop(); + else + frag.insn[n++] = + jit_x0_bfextu( + rd, rd, + UA_FIXUP_BFEXT_START(2), + UA_FIXUP_BFEXT_END(2)) | + jit_x1_fnop(); + } + + frag.insn[n++] = jit_x0_fnop() | jit_x1_iret(); + } + + /* Max JIT bundle count is 14. */ + WARN_ON(n > 14); + + if (!unexpected) { + int status = 0; + int idx = (regs->pc >> 3) & + ((1ULL << (PAGE_SHIFT - UNALIGN_JIT_SHIFT)) - 1); + + frag.pc = regs->pc; + frag.bundle = bundle; + + if (unaligned_printk) { + pr_info("%s/%d, Unalign fixup: pc=%lx " + "bundle=%lx %d %d %d %d %d %d %d %d.", + current->comm, current->pid, + (unsigned long)frag.pc, + (unsigned long)frag.bundle, + (int)alias, (int)rd, (int)ra, + (int)rb, (int)bundle_2_enable, + (int)y1_lr, (int)y1_br, (int)x1_add); + + for (k = 0; k < n; k += 2) + pr_info("[%d] %016llx %016llx", k, + (unsigned long long)frag.insn[k], + (unsigned long long)frag.insn[k+1]); + } + + /* Swap bundle byte order for big endian sys. */ +#ifdef __BIG_ENDIAN + frag.bundle = GX_INSN_BSWAP(frag.bundle); + for (k = 0; k < n; k++) + frag.insn[k] = GX_INSN_BSWAP(frag.insn[k]); +#endif /* __BIG_ENDIAN */ + + status = copy_to_user((void __user *)&jit_code_area[idx], + &frag, sizeof(frag)); + if (status) { + /* Fail to copy JIT into user land. send SIGSEGV. */ + siginfo_t info = { + .si_signo = SIGSEGV, + .si_code = SEGV_MAPERR, + .si_addr = (void __user *)&jit_code_area[idx] + }; + + pr_warn("Unalign fixup: pid=%d %s jit_code_area=%llx", + current->pid, current->comm, + (unsigned long long)&jit_code_area[idx]); + + trace_unhandled_signal("segfault in unalign fixup", + regs, + (unsigned long)info.si_addr, + SIGSEGV); + force_sig_info(info.si_signo, &info, current); + return; + } + + + /* Do a cheaper increment, not accurate. */ + unaligned_fixup_count++; + __flush_icache_range((unsigned long)&jit_code_area[idx], + (unsigned long)&jit_code_area[idx] + + sizeof(frag)); + + /* Setup SPR_EX_CONTEXT_0_0/1 for returning to user program.*/ + __insn_mtspr(SPR_EX_CONTEXT_0_0, regs->pc + 8); + __insn_mtspr(SPR_EX_CONTEXT_0_1, PL_ICS_EX1(USER_PL, 0)); + + /* Modify pc at the start of new JIT. */ + regs->pc = (unsigned long)&jit_code_area[idx].insn[0]; + /* Set ICS in SPR_EX_CONTEXT_K_1. */ + regs->ex1 = PL_ICS_EX1(USER_PL, 1); + } +} + + +/* + * C function to generate unalign data JIT. Called from unalign data + * interrupt handler. + * + * First check if unalign fix is disabled or exception did not not come from + * user space or sp register points to unalign address, if true, generate a + * SIGBUS. Then map a page into user space as JIT area if it is not mapped + * yet. Genenerate JIT code by calling jit_bundle_gen(). After that return + * back to exception handler. + * + * The exception handler will "iret" to new generated JIT code after + * restoring caller saved registers. In theory, the JIT code will perform + * another "iret" to resume user's program. + */ + +void do_unaligned(struct pt_regs *regs, int vecnum) +{ + tilegx_bundle_bits __user *pc; + tilegx_bundle_bits bundle; + struct thread_info *info = current_thread_info(); + int align_ctl; + + /* Checks the per-process unaligned JIT flags */ + align_ctl = unaligned_fixup; + switch (task_thread_info(current)->align_ctl) { + case PR_UNALIGN_NOPRINT: + align_ctl = 1; + break; + case PR_UNALIGN_SIGBUS: + align_ctl = 0; + break; + } + + /* Enable iterrupt in order to access user land. */ + local_irq_enable(); + + /* + * The fault came from kernel space. Two choices: + * (a) unaligned_fixup < 1, we will first call get/put_user fixup + * to return -EFAULT. If no fixup, simply panic the kernel. + * (b) unaligned_fixup >=1, we will try to fix the unaligned access + * if it was triggered by get_user/put_user() macros. Panic the + * kernel if it is not fixable. + */ + + if (EX1_PL(regs->ex1) != USER_PL) { + + if (align_ctl < 1) { + unaligned_fixup_count++; + /* If exception came from kernel, try fix it up. */ + if (fixup_exception(regs)) { + if (unaligned_printk) + pr_info("Unalign fixup: %d %llx @%llx", + (int)unaligned_fixup, + (unsigned long long)regs->ex1, + (unsigned long long)regs->pc); + return; + } + /* Not fixable. Go panic. */ + panic("Unalign exception in Kernel. pc=%lx", + regs->pc); + return; + } else { + /* + * Try to fix the exception. If we can't, panic the + * kernel. + */ + bundle = GX_INSN_BSWAP( + *((tilegx_bundle_bits *)(regs->pc))); + jit_bundle_gen(regs, bundle, align_ctl); + return; + } + } + + /* + * Fault came from user with ICS or stack is not aligned. + * If so, we will trigger SIGBUS. + */ + if ((regs->sp & 0x7) || (regs->ex1) || (align_ctl < 0)) { + siginfo_t info = { + .si_signo = SIGBUS, + .si_code = BUS_ADRALN, + .si_addr = (unsigned char __user *)0 + }; + + if (unaligned_printk) + pr_info("Unalign fixup: %d %llx @%llx", + (int)unaligned_fixup, + (unsigned long long)regs->ex1, + (unsigned long long)regs->pc); + + unaligned_fixup_count++; + + trace_unhandled_signal("unaligned fixup trap", regs, 0, SIGBUS); + force_sig_info(info.si_signo, &info, current); + return; + } + + + /* Read the bundle casued the exception! */ + pc = (tilegx_bundle_bits __user *)(regs->pc); + if (get_user(bundle, pc) != 0) { + /* Probably never be here since pc is valid user address.*/ + siginfo_t info = { + .si_signo = SIGSEGV, + .si_code = SEGV_MAPERR, + .si_addr = (void __user *)pc + }; + pr_err("Couldn't read instruction at %p trying to step\n", pc); + trace_unhandled_signal("segfault in unalign fixup", regs, + (unsigned long)info.si_addr, SIGSEGV); + force_sig_info(info.si_signo, &info, current); + return; + } + + if (!info->unalign_jit_base) { + void __user *user_page; + + /* + * Allocate a page in userland. + * For 64-bit processes we try to place the mapping far + * from anything else that might be going on (specifically + * 64 GB below the top of the user address space). If it + * happens not to be possible to put it there, it's OK; + * the kernel will choose another location and we'll + * remember it for later. + */ + if (is_compat_task()) + user_page = NULL; + else + user_page = (void __user *)(TASK_SIZE - (1UL << 36)) + + (current->pid << PAGE_SHIFT); + + user_page = (void __user *) vm_mmap(NULL, + (unsigned long)user_page, + PAGE_SIZE, + PROT_EXEC | PROT_READ | + PROT_WRITE, +#ifdef CONFIG_HOMECACHE + MAP_CACHE_HOME_TASK | +#endif + MAP_PRIVATE | + MAP_ANONYMOUS, + 0); + + if (IS_ERR((void __force *)user_page)) { + pr_err("Out of kernel pages trying do_mmap.\n"); + return; + } + + /* Save the address in the thread_info struct */ + info->unalign_jit_base = user_page; + if (unaligned_printk) + pr_info("Unalign bundle: %d:%d, allocate page @%llx", + raw_smp_processor_id(), current->pid, + (unsigned long long)user_page); + } + + /* Generate unalign JIT */ + jit_bundle_gen(regs, GX_INSN_BSWAP(bundle), align_ctl); +} + +#endif /* __tilegx__ */ diff --git a/arch/tile/kernel/usb.c b/arch/tile/kernel/usb.c new file mode 100644 index 00000000000..5af8debc6a7 --- /dev/null +++ b/arch/tile/kernel/usb.c @@ -0,0 +1,69 @@ +/* + * Copyright 2012 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + * + * Register the Tile-Gx USB interfaces as platform devices. + * + * The actual USB driver is just some glue (in + * drivers/usb/host/[eo]hci-tilegx.c) which makes the registers available + * to the standard kernel EHCI and OHCI drivers. + */ + +#include <linux/dma-mapping.h> +#include <linux/platform_device.h> +#include <linux/usb/tilegx.h> +#include <linux/types.h> + +static u64 ehci_dmamask = DMA_BIT_MASK(32); + +#define USB_HOST_DEF(unit, type, dmamask) \ + static struct \ + tilegx_usb_platform_data tilegx_usb_platform_data_ ## type ## \ + hci ## unit = { \ + .dev_index = unit, \ + }; \ + \ + static struct platform_device tilegx_usb_ ## type ## hci ## unit = { \ + .name = "tilegx-" #type "hci", \ + .id = unit, \ + .dev = { \ + .dma_mask = dmamask, \ + .coherent_dma_mask = DMA_BIT_MASK(32), \ + .platform_data = \ + &tilegx_usb_platform_data_ ## type ## hci ## \ + unit, \ + }, \ + }; + +USB_HOST_DEF(0, e, &ehci_dmamask) +USB_HOST_DEF(0, o, NULL) +USB_HOST_DEF(1, e, &ehci_dmamask) +USB_HOST_DEF(1, o, NULL) + +#undef USB_HOST_DEF + +static struct platform_device *tilegx_usb_devices[] __initdata = { + &tilegx_usb_ehci0, + &tilegx_usb_ehci1, + &tilegx_usb_ohci0, + &tilegx_usb_ohci1, +}; + +/** Add our set of possible USB devices. */ +static int __init tilegx_usb_init(void) +{ + platform_add_devices(tilegx_usb_devices, + ARRAY_SIZE(tilegx_usb_devices)); + + return 0; +} +arch_initcall(tilegx_usb_init); diff --git a/arch/tile/kernel/vdso.c b/arch/tile/kernel/vdso.c new file mode 100644 index 00000000000..1533af24106 --- /dev/null +++ b/arch/tile/kernel/vdso.c @@ -0,0 +1,212 @@ +/* + * Copyright 2012 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/binfmts.h> +#include <linux/compat.h> +#include <linux/elf.h> +#include <linux/mm.h> +#include <linux/pagemap.h> + +#include <asm/vdso.h> +#include <asm/mman.h> +#include <asm/sections.h> + +#include <arch/sim.h> + +/* The alignment of the vDSO. */ +#define VDSO_ALIGNMENT PAGE_SIZE + + +static unsigned int vdso_pages; +static struct page **vdso_pagelist; + +#ifdef CONFIG_COMPAT +static unsigned int vdso32_pages; +static struct page **vdso32_pagelist; +#endif +static int vdso_ready; + +/* + * The vdso data page. + */ +static union { + struct vdso_data data; + u8 page[PAGE_SIZE]; +} vdso_data_store __page_aligned_data; + +struct vdso_data *vdso_data = &vdso_data_store.data; + +static unsigned int __read_mostly vdso_enabled = 1; + +static struct page **vdso_setup(void *vdso_kbase, unsigned int pages) +{ + int i; + struct page **pagelist; + + pagelist = kzalloc(sizeof(struct page *) * (pages + 1), GFP_KERNEL); + BUG_ON(pagelist == NULL); + for (i = 0; i < pages - 1; i++) { + struct page *pg = virt_to_page(vdso_kbase + i*PAGE_SIZE); + ClearPageReserved(pg); + pagelist[i] = pg; + } + pagelist[pages - 1] = virt_to_page(vdso_data); + pagelist[pages] = NULL; + + return pagelist; +} + +static int __init vdso_init(void) +{ + int data_pages = sizeof(vdso_data_store) >> PAGE_SHIFT; + + /* + * We can disable vDSO support generally, but we need to retain + * one page to support the two-bundle (16-byte) rt_sigreturn path. + */ + if (!vdso_enabled) { + size_t offset = (unsigned long)&__vdso_rt_sigreturn; + static struct page *sigret_page; + sigret_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + BUG_ON(sigret_page == NULL); + vdso_pagelist = &sigret_page; + vdso_pages = 1; + BUG_ON(offset >= PAGE_SIZE); + memcpy(page_address(sigret_page) + offset, + vdso_start + offset, 16); +#ifdef CONFIG_COMPAT + vdso32_pages = vdso_pages; + vdso32_pagelist = vdso_pagelist; +#endif + vdso_ready = 1; + return 0; + } + + vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT; + vdso_pages += data_pages; + vdso_pagelist = vdso_setup(vdso_start, vdso_pages); + +#ifdef CONFIG_COMPAT + vdso32_pages = (vdso32_end - vdso32_start) >> PAGE_SHIFT; + vdso32_pages += data_pages; + vdso32_pagelist = vdso_setup(vdso32_start, vdso32_pages); +#endif + + smp_wmb(); + vdso_ready = 1; + + return 0; +} +arch_initcall(vdso_init); + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == VDSO_BASE) + return "[vdso]"; +#ifndef __tilegx__ + if (vma->vm_start == MEM_USER_INTRPT) + return "[intrpt]"; +#endif + return NULL; +} + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ + return NULL; +} + +int in_gate_area(struct mm_struct *mm, unsigned long address) +{ + return 0; +} + +int in_gate_area_no_mm(unsigned long address) +{ + return 0; +} + +int setup_vdso_pages(void) +{ + struct page **pagelist; + unsigned long pages; + struct mm_struct *mm = current->mm; + unsigned long vdso_base = 0; + int retval = 0; + + if (!vdso_ready) + return 0; + + mm->context.vdso_base = 0; + + pagelist = vdso_pagelist; + pages = vdso_pages; +#ifdef CONFIG_COMPAT + if (is_compat_task()) { + pagelist = vdso32_pagelist; + pages = vdso32_pages; + } +#endif + + /* + * vDSO has a problem and was disabled, just don't "enable" it for the + * process. + */ + if (pages == 0) + return 0; + + vdso_base = get_unmapped_area(NULL, vdso_base, + (pages << PAGE_SHIFT) + + ((VDSO_ALIGNMENT - 1) & PAGE_MASK), + 0, 0); + if (IS_ERR_VALUE(vdso_base)) { + retval = vdso_base; + return retval; + } + + /* Add required alignment. */ + vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT); + + /* + * Put vDSO base into mm struct. We need to do this before calling + * install_special_mapping or the perf counter mmap tracking code + * will fail to recognise it as a vDSO (since arch_vma_name fails). + */ + mm->context.vdso_base = vdso_base; + + /* + * our vma flags don't have VM_WRITE so by default, the process isn't + * allowed to write those pages. + * gdb can break that with ptrace interface, and thus trigger COW on + * those pages but it's then your responsibility to never do that on + * the "data" page of the vDSO or you'll stop getting kernel updates + * and your nice userland gettimeofday will be totally dead. + * It's fine to use that for setting breakpoints in the vDSO code + * pages though + */ + retval = install_special_mapping(mm, vdso_base, + pages << PAGE_SHIFT, + VM_READ|VM_EXEC | + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, + pagelist); + if (retval) + mm->context.vdso_base = 0; + + return retval; +} + +static __init int vdso_func(char *s) +{ + return kstrtouint(s, 0, &vdso_enabled); +} +__setup("vdso=", vdso_func); diff --git a/arch/tile/kernel/vdso/Makefile b/arch/tile/kernel/vdso/Makefile new file mode 100644 index 00000000000..a025f63d54c --- /dev/null +++ b/arch/tile/kernel/vdso/Makefile @@ -0,0 +1,118 @@ +# Symbols present in the vdso +vdso-syms = rt_sigreturn gettimeofday + +# Files to link into the vdso +obj-vdso = $(patsubst %, v%.o, $(vdso-syms)) + +# Build rules +targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds +obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) + +# vdso32 is only for tilegx -m32 compat task. +VDSO32-$(CONFIG_COMPAT) := y + +obj-y += vdso.o +obj-$(VDSO32-y) += vdso32.o +extra-y += vdso.lds +CPPFLAGS_vdso.lds += -P -C -U$(ARCH) + +# vDSO code runs in userspace and -pg doesn't help with profiling anyway. +CFLAGS_REMOVE_vdso.o = -pg +CFLAGS_REMOVE_vdso32.o = -pg +CFLAGS_REMOVE_vrt_sigreturn.o = -pg +CFLAGS_REMOVE_vrt_sigreturn32.o = -pg +CFLAGS_REMOVE_vgettimeofday.o = -pg +CFLAGS_REMOVE_vgettimeofday32.o = -pg + +ifdef CONFIG_FEEDBACK_COLLECT +# vDSO code runs in userspace, not collecting feedback data. +CFLAGS_REMOVE_vdso.o = -ffeedback-generate +CFLAGS_REMOVE_vdso32.o = -ffeedback-generate +CFLAGS_REMOVE_vrt_sigreturn.o = -ffeedback-generate +CFLAGS_REMOVE_vrt_sigreturn32.o = -ffeedback-generate +CFLAGS_REMOVE_vgettimeofday.o = -ffeedback-generate +CFLAGS_REMOVE_vgettimeofday32.o = -ffeedback-generate +endif + +# Disable gcov profiling for VDSO code +GCOV_PROFILE := n + +# Force dependency +$(obj)/vdso.o: $(obj)/vdso.so + +# link rule for the .so file, .lds has to be first +SYSCFLAGS_vdso.so.dbg = $(c_flags) +$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) + $(call if_changed,vdsold) + + +# We also create a special relocatable object that should mirror the symbol +# table and layout of the linked DSO. With ld -R we can then refer to +# these symbols in the kernel code rather than hand-coded addresses. +extra-y += vdso-syms.o +$(obj)/built-in.o: $(obj)/vdso-syms.o +$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o + +SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \ + $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) +SYSCFLAGS_vdso_syms.o = -r +$(obj)/vdso-syms.o: $(src)/vdso.lds $(obj)/vrt_sigreturn.o FORCE + $(call if_changed,vdsold) + + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +# actual build commands +# The DSO images are built using a special linker script +# Add -lgcc so tilepro gets static muldi3 and lshrdi3 definitions. +# Make sure only to export the intended __vdso_xxx symbol offsets. +quiet_cmd_vdsold = VDSOLD $@ + cmd_vdsold = $(CC) $(KCFLAGS) -nostdlib $(SYSCFLAGS_$(@F)) \ + -Wl,-T,$(filter-out FORCE,$^) -o $@.tmp -lgcc && \ + $(CROSS_COMPILE)objcopy \ + $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ + +# install commands for the unstripped file +quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ + +vdso.so: $(obj)/vdso.so.dbg + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso32.so: $(obj)/vdso32.so.dbg + $(call cmd,vdso_install) + +vdso_install: vdso.so +vdso32_install: vdso32.so + + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS_32 += -m32 -s +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 += -m32 -fPIC -shared + +obj-vdso32 = $(patsubst %, v%32.o, $(vdso-syms)) +obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) + +targets += $(obj-vdso32) vdso32.so vdso32.so.dbg + +$(obj-vdso32:%=%): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) +$(obj-vdso32:%=%): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) + +$(obj)/vgettimeofday32.o: $(obj)/vgettimeofday.c + $(call if_changed_rule,cc_o_c) + +$(obj)/vrt_sigreturn32.o: $(obj)/vrt_sigreturn.S + $(call if_changed,as_o_S) + +# Force dependency +$(obj)/vdso32.o: $(obj)/vdso32.so + +SYSCFLAGS_vdso32.so.dbg = -m32 -shared -s -Wl,-soname=linux-vdso32.so.1 \ + $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) +$(obj)/vdso32.so.dbg: $(src)/vdso.lds $(obj-vdso32) + $(call if_changed,vdsold) diff --git a/arch/tile/kernel/vdso/vdso.S b/arch/tile/kernel/vdso/vdso.S new file mode 100644 index 00000000000..3467adb4163 --- /dev/null +++ b/arch/tile/kernel/vdso/vdso.S @@ -0,0 +1,28 @@ +/* + * Copyright 2012 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/page.h> + + __PAGE_ALIGNED_DATA + + .global vdso_start, vdso_end + .align PAGE_SIZE +vdso_start: + .incbin "arch/tile/kernel/vdso/vdso.so" + .align PAGE_SIZE +vdso_end: + + .previous diff --git a/arch/tile/kernel/vdso/vdso.lds.S b/arch/tile/kernel/vdso/vdso.lds.S new file mode 100644 index 00000000000..041cd6c39c8 --- /dev/null +++ b/arch/tile/kernel/vdso/vdso.lds.S @@ -0,0 +1,87 @@ +/* + * Copyright 2012 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#define VDSO_VERSION_STRING LINUX_2.6 + + +OUTPUT_ARCH(tile) + +/* The ELF entry point can be used to set the AT_SYSINFO value. */ +ENTRY(__vdso_rt_sigreturn); + + +SECTIONS +{ + . = SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + .dynamic : { *(.dynamic) } :text :dynamic + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + + /* + * This linker script is used both with -r and with -shared. + * For the layouts to match, we need to skip more than enough + * space for the dynamic symbol table et al. If this amount + * is insufficient, ld -shared will barf. Just increase it here. + */ + . = 0x1000; + .text : { *(.text .text.*) } :text + + .data : { + *(.got.plt) *(.got) + *(.data .data.* .gnu.linkonce.d.*) + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + } +} + + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION +{ + VDSO_VERSION_STRING { + global: + __vdso_rt_sigreturn; + __vdso_gettimeofday; + gettimeofday; + local:*; + }; +} diff --git a/arch/tile/kernel/vdso/vdso32.S b/arch/tile/kernel/vdso/vdso32.S new file mode 100644 index 00000000000..1d1ac3257e1 --- /dev/null +++ b/arch/tile/kernel/vdso/vdso32.S @@ -0,0 +1,28 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/page.h> + + __PAGE_ALIGNED_DATA + + .global vdso32_start, vdso32_end + .align PAGE_SIZE +vdso32_start: + .incbin "arch/tile/kernel/vdso/vdso32.so" + .align PAGE_SIZE +vdso32_end: + + .previous diff --git a/arch/tile/kernel/vdso/vgettimeofday.c b/arch/tile/kernel/vdso/vgettimeofday.c new file mode 100644 index 00000000000..51ec8e46f5f --- /dev/null +++ b/arch/tile/kernel/vdso/vgettimeofday.c @@ -0,0 +1,107 @@ +/* + * Copyright 2012 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#define VDSO_BUILD /* avoid some shift warnings for -m32 in <asm/page.h> */ +#include <linux/time.h> +#include <asm/timex.h> +#include <asm/vdso.h> + +#if CHIP_HAS_SPLIT_CYCLE() +static inline cycles_t get_cycles_inline(void) +{ + unsigned int high = __insn_mfspr(SPR_CYCLE_HIGH); + unsigned int low = __insn_mfspr(SPR_CYCLE_LOW); + unsigned int high2 = __insn_mfspr(SPR_CYCLE_HIGH); + + while (unlikely(high != high2)) { + low = __insn_mfspr(SPR_CYCLE_LOW); + high = high2; + high2 = __insn_mfspr(SPR_CYCLE_HIGH); + } + + return (((cycles_t)high) << 32) | low; +} +#define get_cycles get_cycles_inline +#endif + +/* + * Find out the vDSO data page address in the process address space. + */ +inline unsigned long get_datapage(void) +{ + unsigned long ret; + + /* vdso data page located in the 2nd vDSO page. */ + asm volatile ("lnk %0" : "=r"(ret)); + ret &= ~(PAGE_SIZE - 1); + ret += PAGE_SIZE; + + return ret; +} + +int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + cycles_t cycles; + unsigned long count, sec, ns; + volatile struct vdso_data *vdso_data; + + vdso_data = (struct vdso_data *)get_datapage(); + /* The use of the timezone is obsolete, normally tz is NULL. */ + if (unlikely(tz != NULL)) { + while (1) { + /* Spin until the update finish. */ + count = vdso_data->tz_update_count; + if (count & 1) + continue; + + tz->tz_minuteswest = vdso_data->tz_minuteswest; + tz->tz_dsttime = vdso_data->tz_dsttime; + + /* Check whether updated, read again if so. */ + if (count == vdso_data->tz_update_count) + break; + } + } + + if (unlikely(tv == NULL)) + return 0; + + while (1) { + /* Spin until the update finish. */ + count = vdso_data->tb_update_count; + if (count & 1) + continue; + + cycles = (get_cycles() - vdso_data->xtime_tod_stamp); + ns = (cycles * vdso_data->mult) >> vdso_data->shift; + sec = vdso_data->xtime_clock_sec; + ns += vdso_data->xtime_clock_nsec; + if (ns >= NSEC_PER_SEC) { + ns -= NSEC_PER_SEC; + sec += 1; + } + + /* Check whether updated, read again if so. */ + if (count == vdso_data->tb_update_count) + break; + } + + tv->tv_sec = sec; + tv->tv_usec = ns / 1000; + + return 0; +} + +int gettimeofday(struct timeval *tv, struct timezone *tz) + __attribute__((weak, alias("__vdso_gettimeofday"))); diff --git a/arch/tile/kernel/vdso/vrt_sigreturn.S b/arch/tile/kernel/vdso/vrt_sigreturn.S new file mode 100644 index 00000000000..6326caf4a03 --- /dev/null +++ b/arch/tile/kernel/vdso/vrt_sigreturn.S @@ -0,0 +1,30 @@ +/* + * Copyright 2012 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/linkage.h> +#include <arch/abi.h> +#include <asm/unistd.h> + +/* + * Note that libc has a copy of this function that it uses to compare + * against the PC when a stack backtrace ends, so if this code is + * changed, the libc implementation(s) should also be updated. + */ +ENTRY(__vdso_rt_sigreturn) + moveli TREG_SYSCALL_NR_NAME, __NR_rt_sigreturn + swint1 + /* We don't use ENDPROC to avoid tagging this symbol as FUNC, + * which confuses the perf tool. + */ + END(__vdso_rt_sigreturn) diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S index 631f10de12f..f1819423ffc 100644 --- a/arch/tile/kernel/vmlinux.lds.S +++ b/arch/tile/kernel/vmlinux.lds.S @@ -5,7 +5,7 @@ #include <hv/hypervisor.h> /* Text loads starting from the supervisor interrupt vector address. */ -#define TEXT_OFFSET MEM_SV_INTRPT +#define TEXT_OFFSET MEM_SV_START OUTPUT_ARCH(tile) ENTRY(_start) @@ -13,7 +13,7 @@ jiffies = jiffies_64; PHDRS { - intrpt1 PT_LOAD ; + intrpt PT_LOAD ; text PT_LOAD ; data PT_LOAD ; } @@ -24,23 +24,30 @@ SECTIONS #define LOAD_OFFSET TEXT_OFFSET /* Interrupt vectors */ - .intrpt1 (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */ + .intrpt (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */ { _text = .; - _stext = .; - *(.intrpt1) - } :intrpt1 =0 + *(.intrpt) + } :intrpt =0 /* Hypervisor call vectors */ - #include "hvglue.lds" + . = ALIGN(0x10000); + .hvglue : AT (ADDR(.hvglue) - LOAD_OFFSET) { + *(.hvglue) + } :NONE /* Now the real code */ . = ALIGN(0x20000); + _stext = .; .text : AT (ADDR(.text) - LOAD_OFFSET) { HEAD_TEXT SCHED_TEXT LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT __fix_text_end = .; /* tile-cpack won't rearrange before this */ + ALIGN_FUNCTION(); + *(.hottext*) TEXT_TEXT *(.text.*) *(.coldtext*) @@ -58,27 +65,17 @@ SECTIONS #define LOAD_OFFSET PAGE_OFFSET . = ALIGN(PAGE_SIZE); + __init_begin = .; VMLINUX_SYMBOL(_sinitdata) = .; INIT_DATA_SECTION(16) :data =0 PERCPU_SECTION(L2_CACHE_BYTES) . = ALIGN(PAGE_SIZE); VMLINUX_SYMBOL(_einitdata) = .; + __init_end = .; _sdata = .; /* Start of data section */ - RO_DATA_SECTION(PAGE_SIZE) - - /* initially writeable, then read-only */ - . = ALIGN(PAGE_SIZE); - __w1data_begin = .; - .w1data : AT(ADDR(.w1data) - LOAD_OFFSET) { - VMLINUX_SYMBOL(__w1data_begin) = .; - *(.w1data) - VMLINUX_SYMBOL(__w1data_end) = .; - } - RW_DATA_SECTION(L2_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) - _edata = .; EXCEPTION_TABLE(L2_CACHE_BYTES) |
