Diffstat (limited to 'arch/x86/kernel/vsyscall_64.c')
-rw-r--r--   arch/x86/kernel/vsyscall_64.c   438
1 file changed, 239 insertions, 199 deletions
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c4b69..ea5b5709aa7 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -2,6 +2,8 @@
  *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
  *  Copyright 2003 Andi Kleen, SuSE Labs.
  *
+ *  [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
+ *
  *  Thanks to hpa@transmeta.com for some useful hint.
  *  Special thanks to Ingo Molnar for his early experience with
  *  a different vsyscall implementation for Linux/IA32 and for the name.
@@ -11,14 +13,12 @@
  *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
  *  jumping out of line if necessary. We cannot add more with this
  *  mechanism because older kernels won't return -ENOSYS.
- *  If we want more than four we need a vDSO.
  *
- *  Note: the concept clashes with user mode linux. If you use UML and
- *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
+ *  Note: the concept clashes with user mode linux.  UML users should
+ *  use the vDSO.
  */

-/* Disable profiling for userspace code: */
-#define DISABLE_BRANCH_PROFILING
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

 #include <linux/time.h>
 #include <linux/init.h>
@@ -27,14 +27,18 @@
 #include <linux/seqlock.h>
 #include <linux/jiffies.h>
 #include <linux/sysctl.h>
-#include <linux/clocksource.h>
+#include <linux/topology.h>
+#include <linux/timekeeper_internal.h>
 #include <linux/getcpu.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/notifier.h>
+#include <linux/syscalls.h>
+#include <linux/ratelimit.h>

 #include <asm/vsyscall.h>
 #include <asm/pgtable.h>
+#include <asm/compat.h>
 #include <asm/page.h>
 #include <asm/unistd.h>
 #include <asm/fixmap.h>
@@ -43,215 +47,247 @@
 #include <asm/segment.h>
 #include <asm/desc.h>
 #include <asm/topology.h>
-#include <asm/vgtod.h>
+#include <asm/traps.h>

-#define __vsyscall(nr) \
-		__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
-#define __syscall_clobber "r11","cx","memory"
+#define CREATE_TRACE_POINTS
+#include "vsyscall_trace.h"

-/*
- * vsyscall_gtod_data contains data that is :
- * - readonly from vsyscalls
- * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
- * Try to keep this structure as small as possible to avoid cache line ping pongs
- */
-int __vgetcpu_mode __section_vgetcpu_mode;
+DEFINE_VVAR(int, vgetcpu_mode);

-struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
-{
-	.lock = SEQLOCK_UNLOCKED,
-	.sysctl_enabled = 1,
-};
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;

-void update_vsyscall_tz(void)
+static int __init vsyscall_setup(char *str)
 {
-	unsigned long flags;
+	if (str) {
+		if (!strcmp("emulate", str))
+			vsyscall_mode = EMULATE;
+		else if (!strcmp("native", str))
+			vsyscall_mode = NATIVE;
+		else if (!strcmp("none", str))
+			vsyscall_mode = NONE;
+		else
+			return -EINVAL;
+
+		return 0;
+	}

-	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
-	/* sys_tz has changed */
-	vsyscall_gtod_data.sys_tz = sys_tz;
-	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+	return -EINVAL;
 }
+early_param("vsyscall", vsyscall_setup);

-void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
-			struct clocksource *clock, u32 mult)
+static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
+			      const char *message)
 {
-	unsigned long flags;
-
-	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
-	/* copy vsyscall data */
-	vsyscall_gtod_data.clock.vread = clock->vread;
-	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
-	vsyscall_gtod_data.clock.mask = clock->mask;
-	vsyscall_gtod_data.clock.mult = mult;
-	vsyscall_gtod_data.clock.shift = clock->shift;
-	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
-	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
-	vsyscall_gtod_data.wall_to_monotonic = *wtm;
-	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
-	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
-}
+	if (!show_unhandled_signals)
+		return;

-/* RED-PEN may want to readd seq locking, but then the variable should be
- * write-once.
- */
-static __always_inline void do_get_tz(struct timezone * tz)
-{
-	*tz = __vsyscall_gtod_data.sys_tz;
+	pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+			      level, current->comm, task_pid_nr(current),
+			      message, regs->ip, regs->cs,
+			      regs->sp, regs->ax, regs->si, regs->di);
 }

-static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
+static int addr_to_vsyscall_nr(unsigned long addr)
 {
-	int ret;
-	asm volatile("syscall"
-		: "=a" (ret)
-		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
-		: __syscall_clobber );
-	return ret;
+	int nr;
+
+	if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
+		return -EINVAL;
+
+	nr = (addr & 0xC00UL) >> 10;
+	if (nr >= 3)
+		return -EINVAL;
+
+	return nr;
 }

-static __always_inline long time_syscall(long *t)
+static bool write_ok_or_segv(unsigned long ptr, size_t size)
 {
-	long secs;
-	asm volatile("syscall"
-		: "=a" (secs)
-		: "0" (__NR_time),"D" (t) : __syscall_clobber);
-	return secs;
+	/*
+	 * XXX: if access_ok, get_user, and put_user handled
+	 * sig_on_uaccess_error, this could go away.
+	 */
+
+	if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
+		siginfo_t info;
+		struct thread_struct *thread = &current->thread;
+
+		thread->error_code	= 6;  /* user fault, no page, write */
+		thread->cr2		= ptr;
+		thread->trap_nr		= X86_TRAP_PF;
+
+		memset(&info, 0, sizeof(info));
+		info.si_signo		= SIGSEGV;
+		info.si_errno		= 0;
+		info.si_code		= SEGV_MAPERR;
+		info.si_addr		= (void __user *)ptr;
+
+		force_sig_info(SIGSEGV, &info, current);
+		return false;
+	} else {
+		return true;
+	}
 }

-static __always_inline void do_vgettimeofday(struct timeval * tv)
+bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 {
-	cycle_t now, base, mask, cycle_delta;
-	unsigned seq;
-	unsigned long mult, shift, nsec;
-	cycle_t (*vread)(void);
-	do {
-		seq = read_seqbegin(&__vsyscall_gtod_data.lock);
-
-		vread = __vsyscall_gtod_data.clock.vread;
-		if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
-			gettimeofday(tv,NULL);
-			return;
-		}
-
-		now = vread();
-		base = __vsyscall_gtod_data.clock.cycle_last;
-		mask = __vsyscall_gtod_data.clock.mask;
-		mult = __vsyscall_gtod_data.clock.mult;
-		shift = __vsyscall_gtod_data.clock.shift;
+	struct task_struct *tsk;
+	unsigned long caller;
+	int vsyscall_nr, syscall_nr, tmp;
+	int prev_sig_on_uaccess_error;
+	long ret;
+
+	/*
+	 * No point in checking CS -- the only way to get here is a user mode
+	 * trap to a high address, which means that we're in 64-bit user code.
+	 */
+
+	WARN_ON_ONCE(address != regs->ip);
+
+	if (vsyscall_mode == NONE) {
+		warn_bad_vsyscall(KERN_INFO, regs,
+				  "vsyscall attempted with vsyscall=none");
+		return false;
+	}

-		tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
-		nsec = __vsyscall_gtod_data.wall_time_nsec;
-	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+	vsyscall_nr = addr_to_vsyscall_nr(address);

-	/* calculate interval: */
-	cycle_delta = (now - base) & mask;
-	/* convert to nsecs: */
-	nsec += (cycle_delta * mult) >> shift;
+	trace_emulate_vsyscall(vsyscall_nr);

-	while (nsec >= NSEC_PER_SEC) {
-		tv->tv_sec += 1;
-		nsec -= NSEC_PER_SEC;
+	if (vsyscall_nr < 0) {
+		warn_bad_vsyscall(KERN_WARNING, regs,
				  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
+		goto sigsegv;
 	}
-	tv->tv_usec = nsec / NSEC_PER_USEC;
-}
-int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
-{
-	if (tv)
-		do_vgettimeofday(tv);
-	if (tz)
-		do_get_tz(tz);
-	return 0;
-}
+	if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
+		warn_bad_vsyscall(KERN_WARNING, regs,
+				  "vsyscall with bad stack (exploit attempt?)");
+		goto sigsegv;
+	}

-/* This will break when the xtime seconds get inaccurate, but that is
- * unlikely */
-time_t __vsyscall(1) vtime(time_t *t)
-{
-	unsigned seq;
-	time_t result;
-	if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
-		return time_syscall(t);
+	tsk = current;
+
+	/*
+	 * Check for access_ok violations and find the syscall nr.
+	 *
+	 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
+	 * 64-bit, so we don't need to special-case it here.  For all the
+	 * vsyscalls, NULL means "don't write anything" not "write it at
+	 * address 0".
+	 */
+	switch (vsyscall_nr) {
+	case 0:
+		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+		    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
+			ret = -EFAULT;
+			goto check_fault;
+		}

-	do {
-		seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+		syscall_nr = __NR_gettimeofday;
+		break;

-		result = __vsyscall_gtod_data.wall_time_sec;
+	case 1:
+		if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
+			ret = -EFAULT;
+			goto check_fault;
+		}

-	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+		syscall_nr = __NR_time;
+		break;

-	if (t)
-		*t = result;
-	return result;
-}
+	case 2:
+		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+		    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
+			ret = -EFAULT;
+			goto check_fault;
+		}

-/* Fast way to get current CPU and node.
-   This helps to do per node and per CPU caches in user space.
-   The result is not guaranteed without CPU affinity, but usually
-   works out because the scheduler tries to keep a thread on the same
-   CPU.
+		syscall_nr = __NR_getcpu;
+		break;
+	}

-   tcache must point to a two element sized long array.
-   All arguments can be NULL. */
-long __vsyscall(2)
-vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
-{
-	unsigned int p;
-	unsigned long j = 0;
-
-	/* Fast cache - only recompute value once per jiffies and avoid
-	   relatively costly rdtscp/cpuid otherwise.
-	   This works because the scheduler usually keeps the process
-	   on the same CPU and this syscall doesn't guarantee its
-	   results anyways.
-	   We do this here because otherwise user space would do it on
-	   its own in a likely inferior way (no access to jiffies).
-	   If you don't like it pass NULL. */
-	if (tcache && tcache->blob[0] == (j = __jiffies)) {
-		p = tcache->blob[1];
-	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
-		/* Load per CPU data from RDTSCP */
-		native_read_tscp(&p);
-	} else {
-		/* Load per CPU data from GDT */
-		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+	/*
+	 * Handle seccomp.  regs->ip must be the original value.
+	 * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
+	 *
+	 * We could optimize the seccomp disabled case, but performance
+	 * here doesn't matter.
+	 */
+	regs->orig_ax = syscall_nr;
+	regs->ax = -ENOSYS;
+	tmp = secure_computing(syscall_nr);
+	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
+		warn_bad_vsyscall(KERN_DEBUG, regs,
+				  "seccomp tried to change syscall nr or ip");
+		do_exit(SIGSYS);
 	}
-	if (tcache) {
-		tcache->blob[0] = j;
-		tcache->blob[1] = p;
+	if (tmp)
+		goto do_ret;  /* skip requested */
+
+	/*
+	 * With a real vsyscall, page faults cause SIGSEGV.  We want to
+	 * preserve that behavior to make writing exploits harder.
+	 */
+	prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+	current_thread_info()->sig_on_uaccess_error = 1;
+
+	ret = -EFAULT;
+	switch (vsyscall_nr) {
+	case 0:
+		ret = sys_gettimeofday(
+			(struct timeval __user *)regs->di,
+			(struct timezone __user *)regs->si);
+		break;
+
+	case 1:
+		ret = sys_time((time_t __user *)regs->di);
+		break;
+
+	case 2:
+		ret = sys_getcpu((unsigned __user *)regs->di,
+				 (unsigned __user *)regs->si,
+				 NULL);
+		break;
 	}
-	if (cpu)
-		*cpu = p & 0xfff;
-	if (node)
-		*node = p >> 12;
-	return 0;
-}
-
-static long __vsyscall(3) venosys_1(void)
-{
-	return -ENOSYS;
-}
+	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;

-#ifdef CONFIG_SYSCTL
-static ctl_table kernel_table2[] = {
-	{ .procname = "vsyscall64",
-	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
-	  .mode = 0644,
-	  .proc_handler = proc_dointvec },
-	{}
-};
-
-static ctl_table kernel_root_table2[] = {
-	{ .procname = "kernel", .mode = 0555,
-	  .child = kernel_table2 },
-	{}
-};
-#endif
+check_fault:
+	if (ret == -EFAULT) {
+		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
+		warn_bad_vsyscall(KERN_INFO, regs,
+				  "vsyscall fault (exploit attempt?)");
+
+		/*
+		 * If we failed to generate a signal for any reason,
+		 * generate one here.  (This should be impossible.)
+		 */
+		if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
+				 !sigismember(&tsk->pending.signal, SIGSEGV)))
+			goto sigsegv;
+
+		return true;  /* Don't emulate the ret. */
+	}
+
+	regs->ax = ret;
+
+do_ret:
+	/* Emulate a ret instruction. */
+	regs->ip = caller;
+	regs->sp += 8;
+	return true;

-/* Assume __initcall executes before all user space. Hopefully kmod
-   doesn't violate that. We'll find out if it does. */
-static void __cpuinit vsyscall_set_cpu(int cpu)
+sigsegv:
+	force_sig(SIGSEGV, current);
+	return true;
+}
+
+/*
+ * Assume __initcall executes before all user space. Hopefully kmod
+ * doesn't violate that. We'll find out if it does.
+ */
+static void vsyscall_set_cpu(int cpu)
 {
 	unsigned long d;
 	unsigned long node = 0;
@@ -261,54 +297,58 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
 	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
 		write_rdtscp_aux((node << 12) | cpu);

-	/* Store cpu number in limit so that it can be loaded quickly
-	   in user space in vgetcpu.
-	   12 bits for the CPU and 8 bits for the node. */
+	/*
+	 * Store cpu number in limit so that it can be loaded quickly
+	 * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
+	 */
 	d = 0x0f40000000000ULL;
 	d |= cpu;
 	d |= (node & 0xf) << 12;
 	d |= (node >> 4) << 48;
+
 	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
 }

-static void __cpuinit cpu_vsyscall_init(void *arg)
+static void cpu_vsyscall_init(void *arg)
 {
 	/* preemption should be already off */
 	vsyscall_set_cpu(raw_smp_processor_id());
 }

-static int __cpuinit
+static int
 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 {
 	long cpu = (long)arg;
+
 	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
 		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
+
 	return NOTIFY_DONE;
 }

 void __init map_vsyscall(void)
 {
-	extern char __vsyscall_0;
-	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
-
-	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
-	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+	extern char __vsyscall_page;
+	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
+
+	__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
+		     vsyscall_mode == NATIVE
+		     ? PAGE_KERNEL_VSYSCALL
+		     : PAGE_KERNEL_VVAR);
+	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
+		     (unsigned long)VSYSCALL_ADDR);
 }

 static int __init vsyscall_init(void)
 {
-	BUG_ON(((unsigned long) &vgettimeofday !=
-			VSYSCALL_ADDR(__NR_vgettimeofday)));
-	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
-	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
-	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
-#ifdef CONFIG_SYSCTL
-	register_sysctl_table(kernel_root_table2);
-#endif
+	cpu_notifier_register_begin();
+
 	on_each_cpu(cpu_vsyscall_init, NULL, 1);
 	/* notifier priority > KVM */
-	hotcpu_notifier(cpu_vsyscall_notifier, 30);
+	__hotcpu_notifier(cpu_vsyscall_notifier, 30);
+
+	cpu_notifier_register_done();
+
 	return 0;
 }
-
 __initcall(vsyscall_init);
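
For context, the three legacy entry points that emulate_vsyscall() services live at fixed addresses in the vsyscall page (VSYSCALL_ADDR = 0xffffffffff600000, one 1024-byte slot per call, which is what the "(addr & 0xC00UL) >> 10" computation in addr_to_vsyscall_nr() decodes). The demo below is not part of this change; it is a minimal user-space sketch, assuming an x86-64 kernel booted with vsyscall=emulate or vsyscall=native as parsed by vsyscall_setup() above. The VSYSCALL_BASE macro and the typedef names are purely illustrative.

/*
 * Hypothetical demo (not from this diff): call the legacy vsyscall entry
 * points directly by address.  With vsyscall=emulate each call page-faults
 * into emulate_vsyscall(), which performs the real syscall and then emulates
 * the return; with vsyscall=none the process gets SIGSEGV instead.
 */
#include <stdio.h>
#include <sys/time.h>
#include <time.h>

#define VSYSCALL_BASE 0xffffffffff600000UL	/* VSYSCALL_ADDR */

typedef int    (*vgettimeofday_t)(struct timeval *tv, struct timezone *tz);
typedef time_t (*vtime_t)(time_t *t);

int main(void)
{
	/* slot 0: gettimeofday, slot 1: time, slot 2: getcpu (1024 bytes apart) */
	vgettimeofday_t vgettimeofday = (vgettimeofday_t)(VSYSCALL_BASE + 0x000);
	vtime_t vtime = (vtime_t)(VSYSCALL_BASE + 0x400);
	struct timeval tv;

	if (vgettimeofday(&tv, NULL) == 0)
		printf("vsyscall gettimeofday: %ld.%06ld\n",
		       (long)tv.tv_sec, (long)tv.tv_usec);
	printf("vsyscall time: %ld\n", (long)vtime(NULL));
	return 0;
}

Whether the page is mapped at all (and whether it is executable) shows up as the [vsyscall] line in /proc/self/maps; the mode is chosen at boot with the vsyscall=emulate|native|none parameter handled by vsyscall_setup() and applied in map_vsyscall() above.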
