aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kernel/vsyscall_64.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/vsyscall_64.c')
-rw-r--r--arch/x86/kernel/vsyscall_64.c438
1 files changed, 239 insertions, 199 deletions
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c4b69..ea5b5709aa7 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -2,6 +2,8 @@
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright 2003 Andi Kleen, SuSE Labs.
*
+ * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
+ *
* Thanks to hpa@transmeta.com for some useful hint.
* Special thanks to Ingo Molnar for his early experience with
* a different vsyscall implementation for Linux/IA32 and for the name.
@@ -11,14 +13,12 @@
* vsyscalls. One vsyscall can reserve more than 1 slot to avoid
* jumping out of line if necessary. We cannot add more with this
* mechanism because older kernels won't return -ENOSYS.
- * If we want more than four we need a vDSO.
*
- * Note: the concept clashes with user mode linux. If you use UML and
- * want per guest time just set the kernel.vsyscall64 sysctl to 0.
+ * Note: the concept clashes with user mode linux. UML users should
+ * use the vDSO.
*/
-/* Disable profiling for userspace code: */
-#define DISABLE_BRANCH_PROFILING
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/time.h>
#include <linux/init.h>
@@ -27,14 +27,18 @@
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
-#include <linux/clocksource.h>
+#include <linux/topology.h>
+#include <linux/timekeeper_internal.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
+#include <linux/syscalls.h>
+#include <linux/ratelimit.h>
#include <asm/vsyscall.h>
#include <asm/pgtable.h>
+#include <asm/compat.h>
#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
@@ -43,215 +47,247 @@
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
-#include <asm/vgtod.h>
+#include <asm/traps.h>
-#define __vsyscall(nr) \
- __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
-#define __syscall_clobber "r11","cx","memory"
+#define CREATE_TRACE_POINTS
+#include "vsyscall_trace.h"
-/*
- * vsyscall_gtod_data contains data that is :
- * - readonly from vsyscalls
- * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
- * Try to keep this structure as small as possible to avoid cache line ping pongs
- */
-int __vgetcpu_mode __section_vgetcpu_mode;
+DEFINE_VVAR(int, vgetcpu_mode);
-struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
-{
- .lock = SEQLOCK_UNLOCKED,
- .sysctl_enabled = 1,
-};
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
-void update_vsyscall_tz(void)
+static int __init vsyscall_setup(char *str)
{
- unsigned long flags;
+ if (str) {
+ if (!strcmp("emulate", str))
+ vsyscall_mode = EMULATE;
+ else if (!strcmp("native", str))
+ vsyscall_mode = NATIVE;
+ else if (!strcmp("none", str))
+ vsyscall_mode = NONE;
+ else
+ return -EINVAL;
+
+ return 0;
+ }
- write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
- /* sys_tz has changed */
- vsyscall_gtod_data.sys_tz = sys_tz;
- write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+ return -EINVAL;
}
+early_param("vsyscall", vsyscall_setup);
-void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
- struct clocksource *clock, u32 mult)
+static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
+ const char *message)
{
- unsigned long flags;
-
- write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
- /* copy vsyscall data */
- vsyscall_gtod_data.clock.vread = clock->vread;
- vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
- vsyscall_gtod_data.clock.mask = clock->mask;
- vsyscall_gtod_data.clock.mult = mult;
- vsyscall_gtod_data.clock.shift = clock->shift;
- vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
- vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
- vsyscall_gtod_data.wall_to_monotonic = *wtm;
- vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
- write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
-}
+ if (!show_unhandled_signals)
+ return;
-/* RED-PEN may want to readd seq locking, but then the variable should be
- * write-once.
- */
-static __always_inline void do_get_tz(struct timezone * tz)
-{
- *tz = __vsyscall_gtod_data.sys_tz;
+ pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+ level, current->comm, task_pid_nr(current),
+ message, regs->ip, regs->cs,
+ regs->sp, regs->ax, regs->si, regs->di);
}
-static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
+static int addr_to_vsyscall_nr(unsigned long addr)
{
- int ret;
- asm volatile("syscall"
- : "=a" (ret)
- : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
- : __syscall_clobber );
- return ret;
+ int nr;
+
+ if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
+ return -EINVAL;
+
+ nr = (addr & 0xC00UL) >> 10;
+ if (nr >= 3)
+ return -EINVAL;
+
+ return nr;
}
-static __always_inline long time_syscall(long *t)
+static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
- long secs;
- asm volatile("syscall"
- : "=a" (secs)
- : "0" (__NR_time),"D" (t) : __syscall_clobber);
- return secs;
+ /*
+ * XXX: if access_ok, get_user, and put_user handled
+ * sig_on_uaccess_error, this could go away.
+ */
+
+ if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
+ siginfo_t info;
+ struct thread_struct *thread = &current->thread;
+
+ thread->error_code = 6; /* user fault, no page, write */
+ thread->cr2 = ptr;
+ thread->trap_nr = X86_TRAP_PF;
+
+ memset(&info, 0, sizeof(info));
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_MAPERR;
+ info.si_addr = (void __user *)ptr;
+
+ force_sig_info(SIGSEGV, &info, current);
+ return false;
+ } else {
+ return true;
+ }
}
-static __always_inline void do_vgettimeofday(struct timeval * tv)
+bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
- cycle_t now, base, mask, cycle_delta;
- unsigned seq;
- unsigned long mult, shift, nsec;
- cycle_t (*vread)(void);
- do {
- seq = read_seqbegin(&__vsyscall_gtod_data.lock);
-
- vread = __vsyscall_gtod_data.clock.vread;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
- gettimeofday(tv,NULL);
- return;
- }
-
- now = vread();
- base = __vsyscall_gtod_data.clock.cycle_last;
- mask = __vsyscall_gtod_data.clock.mask;
- mult = __vsyscall_gtod_data.clock.mult;
- shift = __vsyscall_gtod_data.clock.shift;
+ struct task_struct *tsk;
+ unsigned long caller;
+ int vsyscall_nr, syscall_nr, tmp;
+ int prev_sig_on_uaccess_error;
+ long ret;
+
+ /*
+ * No point in checking CS -- the only way to get here is a user mode
+ * trap to a high address, which means that we're in 64-bit user code.
+ */
+
+ WARN_ON_ONCE(address != regs->ip);
+
+ if (vsyscall_mode == NONE) {
+ warn_bad_vsyscall(KERN_INFO, regs,
+ "vsyscall attempted with vsyscall=none");
+ return false;
+ }
- tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
- nsec = __vsyscall_gtod_data.wall_time_nsec;
- } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+ vsyscall_nr = addr_to_vsyscall_nr(address);
- /* calculate interval: */
- cycle_delta = (now - base) & mask;
- /* convert to nsecs: */
- nsec += (cycle_delta * mult) >> shift;
+ trace_emulate_vsyscall(vsyscall_nr);
- while (nsec >= NSEC_PER_SEC) {
- tv->tv_sec += 1;
- nsec -= NSEC_PER_SEC;
+ if (vsyscall_nr < 0) {
+ warn_bad_vsyscall(KERN_WARNING, regs,
+ "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
+ goto sigsegv;
}
- tv->tv_usec = nsec / NSEC_PER_USEC;
-}
-int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
-{
- if (tv)
- do_vgettimeofday(tv);
- if (tz)
- do_get_tz(tz);
- return 0;
-}
+ if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
+ warn_bad_vsyscall(KERN_WARNING, regs,
+ "vsyscall with bad stack (exploit attempt?)");
+ goto sigsegv;
+ }
-/* This will break when the xtime seconds get inaccurate, but that is
- * unlikely */
-time_t __vsyscall(1) vtime(time_t *t)
-{
- unsigned seq;
- time_t result;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
- return time_syscall(t);
+ tsk = current;
+
+ /*
+ * Check for access_ok violations and find the syscall nr.
+ *
+ * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
+ * 64-bit, so we don't need to special-case it here. For all the
+ * vsyscalls, NULL means "don't write anything" not "write it at
+ * address 0".
+ */
+ switch (vsyscall_nr) {
+ case 0:
+ if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+ !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
- do {
- seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+ syscall_nr = __NR_gettimeofday;
+ break;
- result = __vsyscall_gtod_data.wall_time_sec;
+ case 1:
+ if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
- } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+ syscall_nr = __NR_time;
+ break;
- if (t)
- *t = result;
- return result;
-}
+ case 2:
+ if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+ !write_ok_or_segv(regs->si, sizeof(unsigned))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
-/* Fast way to get current CPU and node.
- This helps to do per node and per CPU caches in user space.
- The result is not guaranteed without CPU affinity, but usually
- works out because the scheduler tries to keep a thread on the same
- CPU.
+ syscall_nr = __NR_getcpu;
+ break;
+ }
- tcache must point to a two element sized long array.
- All arguments can be NULL. */
-long __vsyscall(2)
-vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
-{
- unsigned int p;
- unsigned long j = 0;
-
- /* Fast cache - only recompute value once per jiffies and avoid
- relatively costly rdtscp/cpuid otherwise.
- This works because the scheduler usually keeps the process
- on the same CPU and this syscall doesn't guarantee its
- results anyways.
- We do this here because otherwise user space would do it on
- its own in a likely inferior way (no access to jiffies).
- If you don't like it pass NULL. */
- if (tcache && tcache->blob[0] == (j = __jiffies)) {
- p = tcache->blob[1];
- } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
- /* Load per CPU data from RDTSCP */
- native_read_tscp(&p);
- } else {
- /* Load per CPU data from GDT */
- asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+ /*
+ * Handle seccomp. regs->ip must be the original value.
+ * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
+ *
+ * We could optimize the seccomp disabled case, but performance
+ * here doesn't matter.
+ */
+ regs->orig_ax = syscall_nr;
+ regs->ax = -ENOSYS;
+ tmp = secure_computing(syscall_nr);
+ if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
+ warn_bad_vsyscall(KERN_DEBUG, regs,
+ "seccomp tried to change syscall nr or ip");
+ do_exit(SIGSYS);
}
- if (tcache) {
- tcache->blob[0] = j;
- tcache->blob[1] = p;
+ if (tmp)
+ goto do_ret; /* skip requested */
+
+ /*
+ * With a real vsyscall, page faults cause SIGSEGV. We want to
+ * preserve that behavior to make writing exploits harder.
+ */
+ prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+ current_thread_info()->sig_on_uaccess_error = 1;
+
+ ret = -EFAULT;
+ switch (vsyscall_nr) {
+ case 0:
+ ret = sys_gettimeofday(
+ (struct timeval __user *)regs->di,
+ (struct timezone __user *)regs->si);
+ break;
+
+ case 1:
+ ret = sys_time((time_t __user *)regs->di);
+ break;
+
+ case 2:
+ ret = sys_getcpu((unsigned __user *)regs->di,
+ (unsigned __user *)regs->si,
+ NULL);
+ break;
}
- if (cpu)
- *cpu = p & 0xfff;
- if (node)
- *node = p >> 12;
- return 0;
-}
-static long __vsyscall(3) venosys_1(void)
-{
- return -ENOSYS;
-}
+ current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
-#ifdef CONFIG_SYSCTL
-static ctl_table kernel_table2[] = {
- { .procname = "vsyscall64",
- .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec },
- {}
-};
-
-static ctl_table kernel_root_table2[] = {
- { .procname = "kernel", .mode = 0555,
- .child = kernel_table2 },
- {}
-};
-#endif
+check_fault:
+ if (ret == -EFAULT) {
+ /* Bad news -- userspace fed a bad pointer to a vsyscall. */
+ warn_bad_vsyscall(KERN_INFO, regs,
+ "vsyscall fault (exploit attempt?)");
+
+ /*
+ * If we failed to generate a signal for any reason,
+ * generate one here. (This should be impossible.)
+ */
+ if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
+ !sigismember(&tsk->pending.signal, SIGSEGV)))
+ goto sigsegv;
+
+ return true; /* Don't emulate the ret. */
+ }
+
+ regs->ax = ret;
+
+do_ret:
+ /* Emulate a ret instruction. */
+ regs->ip = caller;
+ regs->sp += 8;
+ return true;
-/* Assume __initcall executes before all user space. Hopefully kmod
- doesn't violate that. We'll find out if it does. */
-static void __cpuinit vsyscall_set_cpu(int cpu)
+sigsegv:
+ force_sig(SIGSEGV, current);
+ return true;
+}
+
+/*
+ * Assume __initcall executes before all user space. Hopefully kmod
+ * doesn't violate that. We'll find out if it does.
+ */
+static void vsyscall_set_cpu(int cpu)
{
unsigned long d;
unsigned long node = 0;
@@ -261,54 +297,58 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
write_rdtscp_aux((node << 12) | cpu);
- /* Store cpu number in limit so that it can be loaded quickly
- in user space in vgetcpu.
- 12 bits for the CPU and 8 bits for the node. */
+ /*
+ * Store cpu number in limit so that it can be loaded quickly
+ * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
+ */
d = 0x0f40000000000ULL;
d |= cpu;
d |= (node & 0xf) << 12;
d |= (node >> 4) << 48;
+
write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
-static void __cpuinit cpu_vsyscall_init(void *arg)
+static void cpu_vsyscall_init(void *arg)
{
/* preemption should be already off */
vsyscall_set_cpu(raw_smp_processor_id());
}
-static int __cpuinit
+static int
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
long cpu = (long)arg;
+
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
+
return NOTIFY_DONE;
}
void __init map_vsyscall(void)
{
- extern char __vsyscall_0;
- unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
-
- /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
- __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+ extern char __vsyscall_page;
+ unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
+
+ __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
+ vsyscall_mode == NATIVE
+ ? PAGE_KERNEL_VSYSCALL
+ : PAGE_KERNEL_VVAR);
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
+ (unsigned long)VSYSCALL_ADDR);
}
static int __init vsyscall_init(void)
{
- BUG_ON(((unsigned long) &vgettimeofday !=
- VSYSCALL_ADDR(__NR_vgettimeofday)));
- BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
- BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
- BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
-#ifdef CONFIG_SYSCTL
- register_sysctl_table(kernel_root_table2);
-#endif
+ cpu_notifier_register_begin();
+
on_each_cpu(cpu_vsyscall_init, NULL, 1);
/* notifier priority > KVM */
- hotcpu_notifier(cpu_vsyscall_notifier, 30);
+ __hotcpu_notifier(cpu_vsyscall_notifier, 30);
+
+ cpu_notifier_register_done();
+
return 0;
}
-
__initcall(vsyscall_init);