Diffstat (limited to 'arch/x86/kernel/ptrace.c')
-rw-r--r--	arch/x86/kernel/ptrace.c	382
1 file changed, 251 insertions, 131 deletions
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 45892dc4b72..678c0ada3b3 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,18 +21,22 @@
 #include <linux/signal.h>
 #include <linux/perf_event.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <linux/context_tracking.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
-#include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/debugreg.h>
 #include <asm/ldt.h>
 #include <asm/desc.h>
 #include <asm/prctl.h>
 #include <asm/proto.h>
 #include <asm/hw_breakpoint.h>
+#include <asm/traps.h>
 
 #include "tls.h"
 
@@ -164,6 +168,35 @@ static inline bool invalid_selector(u16 value)
 
 #define FLAG_MASK		FLAG_MASK_32
 
+/*
+ * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
+ * when it traps.  The previous stack will be directly underneath the saved
+ * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
+ *
+ * Now, if the stack is empty, '&regs->sp' is out of range. In this
+ * case we try to take the previous stack. To always return a non-null
+ * stack pointer we fall back to regs as stack if no previous stack
+ * exists.
+ *
+ * This is valid only for kernel mode traps.
+ */
+unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
+	unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);
+	unsigned long sp = (unsigned long)&regs->sp;
+	u32 *prev_esp;
+
+	if (context == (sp & ~(THREAD_SIZE - 1)))
+		return sp;
+
+	prev_esp = (u32 *)(context);
+	if (*prev_esp)
+		return (unsigned long)*prev_esp;
+
+	return (unsigned long)regs;
+}
+EXPORT_SYMBOL_GPL(kernel_stack_pointer);
+
 static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
 {
 	BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
@@ -528,7 +561,7 @@ static int genregs_set(struct task_struct *target,
 	return ret;
 }
 
-static void ptrace_triggered(struct perf_event *bp, int nmi,
+static void ptrace_triggered(struct perf_event *bp,
 			     struct perf_sample_data *data,
 			     struct pt_regs *regs)
 {
@@ -568,30 +601,48 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[])
 	return dr7;
 }
 
-static int
-ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
-			 struct task_struct *tsk, int disabled)
+static int ptrace_fill_bp_fields(struct perf_event_attr *attr,
+					int len, int type, bool disabled)
+{
+	int err, bp_len, bp_type;
+
+	err = arch_bp_generic_fields(len, type, &bp_len, &bp_type);
+	if (!err) {
+		attr->bp_len = bp_len;
+		attr->bp_type = bp_type;
+		attr->disabled = disabled;
+	}
+
+	return err;
+}
+
+static struct perf_event *
+ptrace_register_breakpoint(struct task_struct *tsk, int len, int type,
+				unsigned long addr, bool disabled)
 {
-	int err;
-	int gen_len, gen_type;
 	struct perf_event_attr attr;
+	int err;
 
-	/*
-	 * We should have at least an inactive breakpoint at this
-	 * slot. It means the user is writing dr7 without having
-	 * written the address register first
-	 */
-	if (!bp)
-		return -EINVAL;
+	ptrace_breakpoint_init(&attr);
+	attr.bp_addr = addr;
 
-	err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+	err = ptrace_fill_bp_fields(&attr, len, type, disabled);
 	if (err)
-		return err;
+		return ERR_PTR(err);
 
-	attr = bp->attr;
-	attr.bp_len = gen_len;
-	attr.bp_type = gen_type;
-	attr.disabled = disabled;
+	return register_user_hw_breakpoint(&attr, ptrace_triggered,
+						 NULL, tsk);
+}
+
+static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+					int disabled)
+{
+	struct perf_event_attr attr = bp->attr;
+	int err;
+
+	err = ptrace_fill_bp_fields(&attr, len, type, disabled);
+	if (err)
+		return err;
 
 	return modify_user_hw_breakpoint(bp, &attr);
 }
@@ -601,61 +652,50 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
  */
 static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
 {
-	struct thread_struct *thread = &(tsk->thread);
+	struct thread_struct *thread = &tsk->thread;
 	unsigned long old_dr7;
-	int i, orig_ret = 0, rc = 0;
-	int enabled, second_pass = 0;
-	unsigned len, type;
-	struct perf_event *bp;
+	bool second_pass = false;
+	int i, rc, ret = 0;
 
 	data &= ~DR_CONTROL_RESERVED;
 	old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+
 restore:
-	/*
-	 * Loop through all the hardware breakpoints, making the
-	 * appropriate changes to each.
-	 */
+	rc = 0;
 	for (i = 0; i < HBP_NUM; i++) {
-		enabled = decode_dr7(data, i, &len, &type);
-		bp = thread->ptrace_bps[i];
-
-		if (!enabled) {
-			if (bp) {
-				/*
-				 * Don't unregister the breakpoints right-away,
-				 * unless all register_user_hw_breakpoint()
-				 * requests have succeeded. This prevents
-				 * any window of opportunity for debug
-				 * register grabbing by other users.
-				 */
-				if (!second_pass)
-					continue;
-
-				rc = ptrace_modify_breakpoint(bp, len, type,
-							      tsk, 1);
-				if (rc)
-					break;
+		unsigned len, type;
+		bool disabled = !decode_dr7(data, i, &len, &type);
+		struct perf_event *bp = thread->ptrace_bps[i];
+
+		if (!bp) {
+			if (disabled)
+				continue;
+
+			bp = ptrace_register_breakpoint(tsk,
+					len, type, 0, disabled);
+			if (IS_ERR(bp)) {
+				rc = PTR_ERR(bp);
+				break;
 			}
+
+			thread->ptrace_bps[i] = bp;
 			continue;
 		}
 
-		rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
+		rc = ptrace_modify_breakpoint(bp, len, type, disabled);
 		if (rc)
 			break;
 	}
-	/*
-	 * Make a second pass to free the remaining unused breakpoints
-	 * or to restore the original breakpoints if an error occurred.
-	 */
-	if (!second_pass) {
-		second_pass = 1;
-		if (rc < 0) {
-			orig_ret = rc;
-			data = old_dr7;
-		}
+
+	/* Restore if the first pass failed, second_pass shouldn't fail. */
+	if (rc && !WARN_ON(second_pass)) {
+		ret = rc;
+		data = old_dr7;
+		second_pass = true;
 		goto restore;
 	}
-	return ((orig_ret < 0) ? orig_ret : rc);
+
+	return ret;
 }
 
 /*
@@ -663,18 +703,17 @@
  */
 static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
 {
-	struct thread_struct *thread = &(tsk->thread);
+	struct thread_struct *thread = &tsk->thread;
 	unsigned long val = 0;
 
 	if (n < HBP_NUM) {
-		struct perf_event *bp;
-		bp = thread->ptrace_bps[n];
-		if (!bp)
-			return 0;
-		val = bp->hw.info.address;
+		struct perf_event *bp = thread->ptrace_bps[n];
+
+		if (bp)
+			val = bp->hw.info.address;
 	} else if (n == 6) {
 		val = thread->debugreg6;
-	 } else if (n == 7) {
+	} else if (n == 7) {
 		val = thread->ptrace_dr7;
 	}
 	return val;
@@ -683,24 +722,14 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
 static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 				      unsigned long addr)
 {
-	struct perf_event *bp;
 	struct thread_struct *t = &tsk->thread;
-	struct perf_event_attr attr;
-
-	if (!t->ptrace_bps[nr]) {
-		ptrace_breakpoint_init(&attr);
-		/*
-		 * Put stub len and type to register (reserve) an inactive but
-		 * correct bp
-		 */
-		attr.bp_addr = addr;
-		attr.bp_len = HW_BREAKPOINT_LEN_1;
-		attr.bp_type = HW_BREAKPOINT_W;
-		attr.disabled = 1;
-
-		bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+	struct perf_event *bp = t->ptrace_bps[nr];
+	int err = 0;
 
+	if (!bp) {
 		/*
+		 * Put stub len and type to create an inactive but correct bp.
+		 *
 		 * CHECKME: the previous code returned -EIO if the addr wasn't
 		 * a valid task virtual addr. The new one will return -EINVAL in
 		 *  this case.
@@ -709,55 +738,43 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 		 * writing for the user. And anyway this is the previous
 		 * behaviour.
 		 */
+		bp = ptrace_register_breakpoint(tsk,
+				X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE,
+				addr, true);
 		if (IS_ERR(bp))
-			return PTR_ERR(bp);
-
-		t->ptrace_bps[nr] = bp;
+			err = PTR_ERR(bp);
+		else
+			t->ptrace_bps[nr] = bp;
 	} else {
-		int err;
+		struct perf_event_attr attr = bp->attr;
 
-		bp = t->ptrace_bps[nr];
-
-		attr = bp->attr;
 		attr.bp_addr = addr;
 		err = modify_user_hw_breakpoint(bp, &attr);
-		if (err)
-			return err;
 	}
-
-	return 0;
+	return err;
 }
 
 /*
  * Handle PTRACE_POKEUSR calls for the debug register area.
  */
-int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+static int ptrace_set_debugreg(struct task_struct *tsk, int n,
+			       unsigned long val)
 {
-	struct thread_struct *thread = &(tsk->thread);
-	int rc = 0;
-
+	struct thread_struct *thread = &tsk->thread;
 	/* There are no DR4 or DR5 registers */
-	if (n == 4 || n == 5)
-		return -EIO;
+	int rc = -EIO;
 
-	if (n == 6) {
-		thread->debugreg6 = val;
-		goto ret_path;
-	}
 	if (n < HBP_NUM) {
 		rc = ptrace_set_breakpoint_addr(tsk, n, val);
-		if (rc)
-			return rc;
-	}
-	/* All that's left is DR7 */
-	if (n == 7) {
+	} else if (n == 6) {
+		thread->debugreg6 = val;
+		rc = 0;
+	} else if (n == 7) {
 		rc = ptrace_write_dr7(tsk, val);
 		if (!rc)
 			thread->ptrace_dr7 = val;
 	}
-
-ret_path:
 	return rc;
 }
 
@@ -1112,6 +1129,94 @@ static int genregs32_set(struct task_struct *target,
 	return ret;
 }
 
+#ifdef CONFIG_X86_X32_ABI
+static long x32_arch_ptrace(struct task_struct *child,
+			    compat_long_t request, compat_ulong_t caddr,
+			    compat_ulong_t cdata)
+{
+	unsigned long addr = caddr;
+	unsigned long data = cdata;
+	void __user *datap = compat_ptr(data);
+	int ret;
+
+	switch (request) {
+	/* Read 32bits at location addr in the USER area.  Only allow
+	   to return the lower 32bits of segment and debug registers.  */
+	case PTRACE_PEEKUSR: {
+		u32 tmp;
+
+		ret = -EIO;
+		if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+		    addr < offsetof(struct user_regs_struct, cs))
+			break;
+
+		tmp = 0;  /* Default return condition */
+		if (addr < sizeof(struct user_regs_struct))
+			tmp = getreg(child, addr);
+		else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+			 addr <= offsetof(struct user, u_debugreg[7])) {
+			addr -= offsetof(struct user, u_debugreg[0]);
+			tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+		}
+		ret = put_user(tmp, (__u32 __user *)datap);
+		break;
+	}
+
+	/* Write the word at location addr in the USER area.  Only allow
+	   to update segment and debug registers with the upper 32bits
+	   zero-extended. */
+	case PTRACE_POKEUSR:
+		ret = -EIO;
+		if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+		    addr < offsetof(struct user_regs_struct, cs))
+			break;
+
+		if (addr < sizeof(struct user_regs_struct))
+			ret = putreg(child, addr, data);
+		else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+			 addr <= offsetof(struct user, u_debugreg[7])) {
+			addr -= offsetof(struct user, u_debugreg[0]);
+			ret = ptrace_set_debugreg(child,
+						  addr / sizeof(data), data);
+		}
+		break;
+
+	case PTRACE_GETREGS:	/* Get all gp regs from the child. */
+		return copy_regset_to_user(child,
+					   task_user_regset_view(current),
+					   REGSET_GENERAL,
+					   0, sizeof(struct user_regs_struct),
+					   datap);
+
+	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
+		return copy_regset_from_user(child,
+					     task_user_regset_view(current),
+					     REGSET_GENERAL,
+					     0, sizeof(struct user_regs_struct),
+					     datap);
+
+	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
+		return copy_regset_to_user(child,
+					   task_user_regset_view(current),
+					   REGSET_FP,
+					   0, sizeof(struct user_i387_struct),
+					   datap);
+
+	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
+		return copy_regset_from_user(child,
+					     task_user_regset_view(current),
+					     REGSET_FP,
+					     0, sizeof(struct user_i387_struct),
+					     datap);
+
+	default:
+		return compat_ptrace_request(child, request, addr, data);
+	}
+
+	return ret;
+}
+#endif
+
 long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 			compat_ulong_t caddr, compat_ulong_t cdata)
 {
@@ -1121,6 +1226,11 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 	int ret;
 	__u32 val;
 
+#ifdef CONFIG_X86_X32_ABI
+	if (!is_ia32_task())
+		return x32_arch_ptrace(child, request, caddr, cdata);
+#endif
+
 	switch (request) {
 	case PTRACE_PEEKUSR:
 		ret = getreg32(child, addr, &val);
@@ -1220,9 +1330,6 @@ static const struct user_regset_view user_x86_64_view = {
 #define genregs32_get		genregs_get
 #define genregs32_set		genregs_set
 
-#define user_i387_ia32_struct	user_i387_struct
-#define user32_fxsr_struct	user_fxsr_struct
-
 #endif	/* CONFIG_X86_64 */
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1308,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk,
 				int error_code, int si_code,
 				struct siginfo *info)
 {
-	tsk->thread.trap_no = 1;
+	tsk->thread.trap_nr = X86_TRAP_DB;
 	tsk->thread.error_code = error_code;
 
 	memset(info, 0, sizeof(*info));
@@ -1347,10 +1454,12 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 * We must return the syscall number to actually look up in the table.
 * This can be -1L to skip running any syscall at all.
 */
-asmregparm long syscall_trace_enter(struct pt_regs *regs)
+long syscall_trace_enter(struct pt_regs *regs)
 {
 	long ret = 0;
 
+	user_exit();
+
 	/*
 	 * If we stepped into a sysenter/syscall insn, it trapped in
 	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
@@ -1362,7 +1471,11 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
 		regs->flags |= X86_EFLAGS_TF;
 
 	/* do the secure computing check first */
-	secure_computing(regs->orig_ax);
+	if (secure_computing(regs->orig_ax)) {
+		/* seccomp failures shouldn't expose any additional code. */
+		ret = -1L;
+		goto out;
+	}
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
 		ret = -1L;
@@ -1374,30 +1487,35 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->orig_ax);
 
-	if (unlikely(current->audit_context)) {
-		if (IS_IA32)
-			audit_syscall_entry(AUDIT_ARCH_I386,
-					    regs->orig_ax,
-					    regs->bx, regs->cx,
-					    regs->dx, regs->si);
+	if (IS_IA32)
+		audit_syscall_entry(AUDIT_ARCH_I386,
+				    regs->orig_ax,
+				    regs->bx, regs->cx,
+				    regs->dx, regs->si);
 #ifdef CONFIG_X86_64
-		else
-			audit_syscall_entry(AUDIT_ARCH_X86_64,
-					    regs->orig_ax,
-					    regs->di, regs->si,
-					    regs->dx, regs->r10);
+	else
+		audit_syscall_entry(AUDIT_ARCH_X86_64,
+				    regs->orig_ax,
+				    regs->di, regs->si,
+				    regs->dx, regs->r10);
 #endif
-	}
 
+out:
 	return ret ?: regs->orig_ax;
 }
 
-asmregparm void syscall_trace_leave(struct pt_regs *regs)
+void syscall_trace_leave(struct pt_regs *regs)
 {
 	bool step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
+	/*
+	 * We may come here right after calling schedule_user()
+	 * or do_notify_resume(), in which case we can be in RCU
	 * user mode.
+	 */
+	user_exit();
+
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->ax);
@@ -1412,4 +1530,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
 			!test_thread_flag(TIF_SYSCALL_EMU);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall_exit(regs, step);
+
+	user_enter();
 }
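Not part of the patch above: the refactored ptrace_set_debugreg()/ptrace_write_dr7() path is what a tracer exercises when it pokes u_debugreg[] in the USER area. The following userspace sketch is illustrative only; it assumes the conventional x86 DR7 encoding (0xd0001 = slot 0 local-enable, break on write, 4-byte length) and omits error handling, so treat it as a rough demo rather than a tested tool.

/* demo.c - illustrative only: install a 4-byte write watchpoint in a traced
 * child via PTRACE_POKEUSER on u_debugreg[0] (address) and u_debugreg[7]
 * (DR7 control), the path handled by ptrace_set_debugreg() in the diff.
 */
#include <stdio.h>
#include <stddef.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>

static int watched;	/* same address in parent and child after fork() */

int main(void)
{
	int status;
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);		/* let the parent set up the watchpoint */
		watched = 42;		/* this store should raise SIGTRAP */
		_exit(0);
	}

	waitpid(child, &status, 0);	/* child is stopped by SIGSTOP */

	/* DR0 = address to watch (ptrace_set_breakpoint_addr() path). */
	ptrace(PTRACE_POKEUSER, child,
	       (void *)offsetof(struct user, u_debugreg[0]), &watched);

	/* DR7 = 0xd0001 (assumed encoding): local-enable slot 0, break on
	 * write, 4-byte length. This lands in ptrace_write_dr7() above. */
	ptrace(PTRACE_POKEUSER, child,
	       (void *)offsetof(struct user, u_debugreg[7]), (void *)0xd0001UL);

	ptrace(PTRACE_CONT, child, NULL, NULL);
	waitpid(child, &status, 0);
	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
		printf("watchpoint hit on write to 'watched'\n");

	ptrace(PTRACE_CONT, child, NULL, NULL);
	waitpid(child, &status, 0);
	return 0;
}

The poke at offsetof(struct user, u_debugreg[0]) reaches ptrace_set_breakpoint_addr(), which now registers an inactive slot through ptrace_register_breakpoint(); the write to u_debugreg[7] is validated by ptrace_write_dr7() before the breakpoint slots are (re)configured.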
