diff options
Diffstat (limited to 'kernel/sys.c')
| -rw-r--r-- | kernel/sys.c | 2425 |
1 files changed, 1392 insertions, 1033 deletions
diff --git a/kernel/sys.c b/kernel/sys.c index a626116af5d..66a751ebf9d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -4,19 +4,18 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/mm.h> #include <linux/utsname.h> #include <linux/mman.h> -#include <linux/smp_lock.h> -#include <linux/notifier.h> #include <linux/reboot.h> #include <linux/prctl.h> #include <linux/highuid.h> #include <linux/fs.h> +#include <linux/kmod.h> +#include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> -#include <linux/kexec.h> #include <linux/workqueue.h> #include <linux/capability.h> #include <linux/device.h> @@ -33,11 +32,30 @@ #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> #include <linux/cpu.h> +#include <linux/personality.h> +#include <linux/ptrace.h> +#include <linux/fs_struct.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/gfp.h> +#include <linux/syscore_ops.h> +#include <linux/version.h> +#include <linux/ctype.h> #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/kprobes.h> #include <linux/user_namespace.h> +#include <linux/binfmts.h> + +#include <linux/sched.h> +#include <linux/rcupdate.h> +#include <linux/uidgid.h> +#include <linux/cred.h> + +#include <linux/kmsg_dump.h> +/* Move somewhere else to avoid recompiling? */ +#include <generated/utsrelease.h> #include <asm/uaccess.h> #include <asm/io.h> @@ -67,6 +85,12 @@ #ifndef SET_ENDIAN # define SET_ENDIAN(a,b) (-EINVAL) #endif +#ifndef GET_TSC_CTL +# define GET_TSC_CTL(a) (-EINVAL) +#endif +#ifndef SET_TSC_CTL +# define SET_TSC_CTL(a) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -76,10 +100,8 @@ int overflowuid = DEFAULT_OVERFLOWUID; int overflowgid = DEFAULT_OVERFLOWGID; -#ifdef CONFIG_UID16 EXPORT_SYMBOL(overflowuid); EXPORT_SYMBOL(overflowgid); -#endif /* * the same as above, but for filesystems which can only store a 16-bit @@ -93,25 +115,32 @@ EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); /* - * this indicates whether you can reboot with ctrl-alt-del: the default is yes + * Returns true if current's euid is same as p's uid or euid, + * or has CAP_SYS_NICE to p's user_ns. + * + * Called with rcu_read_lock, creds are safe */ - -int C_A_D = 1; -struct pid *cad_pid; -EXPORT_SYMBOL(cad_pid); +static bool set_one_prio_perm(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred = __task_cred(p); + + if (uid_eq(pcred->uid, cred->euid) || + uid_eq(pcred->euid, cred->euid)) + return true; + if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) + return true; + return false; +} /* - * If set, this is used for preparing the system to power off. + * set the priority of a task + * - the caller must hold the RCU read lock */ - -void (*pm_power_off_prepare)(void); - static int set_one_prio(struct task_struct *p, int niceval, int error) { int no_nice; - if (p->uid != current->euid && - p->euid != current->euid && !capable(CAP_SYS_NICE)) { + if (!set_one_prio_perm(p)) { error = -EPERM; goto out; } @@ -131,23 +160,26 @@ out: return error; } -asmlinkage long sys_setpriority(int which, int who, int niceval) +SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) { struct task_struct *g, *p; struct user_struct *user; + const struct cred *cred = current_cred(); int error = -EINVAL; struct pid *pgrp; + kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) goto out; /* normalize: avoid signed division (rounding problems) */ error = -ESRCH; - if (niceval < -20) - niceval = -20; - if (niceval > 19) - niceval = 19; + if (niceval < MIN_NICE) + niceval = MIN_NICE; + if (niceval > MAX_NICE) + niceval = MAX_NICE; + rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: @@ -163,28 +195,30 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) pgrp = find_vpid(who); else pgrp = task_pgrp(current); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->user; + uid = make_kuid(cred->user_ns, who); + user = cred->user; if (!who) - who = current->uid; - else - if ((who != current->uid) && !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ + uid = cred->uid; + else if (!uid_eq(uid, cred->uid) && + !(user = find_user(uid))) + goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) - if (p->uid == who) + do_each_thread(g, p) { + if (uid_eq(task_uid(p), uid)) error = set_one_prio(p, niceval, error); - while_each_thread(g, p); - if (who != current->uid) + } while_each_thread(g, p); + if (!uid_eq(uid, cred->uid)) free_uid(user); /* For find_user() */ break; } out_unlock: read_unlock(&tasklist_lock); + rcu_read_unlock(); out: return error; } @@ -195,16 +229,19 @@ out: * has been offset by 20 (ie it returns 40..1 instead of -20..19) * to stay compatible. */ -asmlinkage long sys_getpriority(int which, int who) +SYSCALL_DEFINE2(getpriority, int, which, int, who) { struct task_struct *g, *p; struct user_struct *user; + const struct cred *cred = current_cred(); long niceval, retval = -ESRCH; struct pid *pgrp; + kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; + rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: @@ -213,7 +250,7 @@ asmlinkage long sys_getpriority(int which, int who) else p = current; if (p) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } @@ -223,244 +260,39 @@ asmlinkage long sys_getpriority(int which, int who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - niceval = 20 - task_nice(p); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->user; + uid = make_kuid(cred->user_ns, who); + user = cred->user; if (!who) - who = current->uid; - else - if ((who != current->uid) && !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) - if (p->uid == who) { - niceval = 20 - task_nice(p); + uid = cred->uid; + else if (!uid_eq(uid, cred->uid) && + !(user = find_user(uid))) + goto out_unlock; /* No processes for this user */ + + do_each_thread(g, p) { + if (uid_eq(task_uid(p), uid)) { + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } - while_each_thread(g, p); - if (who != current->uid) + } while_each_thread(g, p); + if (!uid_eq(uid, cred->uid)) free_uid(user); /* for find_user() */ break; } out_unlock: read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } -/** - * emergency_restart - reboot the system - * - * Without shutting down any hardware or taking any locks - * reboot the system. This is called when we know we are in - * trouble so this is our best effort to reboot. This is - * safe to call in interrupt context. - */ -void emergency_restart(void) -{ - machine_emergency_restart(); -} -EXPORT_SYMBOL_GPL(emergency_restart); - -static void kernel_restart_prepare(char *cmd) -{ - blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); - system_state = SYSTEM_RESTART; - device_shutdown(); - sysdev_shutdown(); -} - -/** - * kernel_restart - reboot the system - * @cmd: pointer to buffer containing command to execute for restart - * or %NULL - * - * Shutdown everything and perform a clean reboot. - * This is not safe to call in interrupt context. - */ -void kernel_restart(char *cmd) -{ - kernel_restart_prepare(cmd); - if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); - else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - machine_restart(cmd); -} -EXPORT_SYMBOL_GPL(kernel_restart); - -/** - * kernel_kexec - reboot the system - * - * Move into place and start executing a preloaded standalone - * executable. If nothing was preloaded return an error. - */ -static void kernel_kexec(void) -{ -#ifdef CONFIG_KEXEC - struct kimage *image; - image = xchg(&kexec_image, NULL); - if (!image) - return; - kernel_restart_prepare(NULL); - printk(KERN_EMERG "Starting new kernel\n"); - machine_shutdown(); - machine_kexec(image); -#endif -} - -static void kernel_shutdown_prepare(enum system_states state) -{ - blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); - system_state = state; - device_shutdown(); -} -/** - * kernel_halt - halt the system - * - * Shutdown everything and perform a clean system halt. - */ -void kernel_halt(void) -{ - kernel_shutdown_prepare(SYSTEM_HALT); - sysdev_shutdown(); - printk(KERN_EMERG "System halted.\n"); - machine_halt(); -} - -EXPORT_SYMBOL_GPL(kernel_halt); - -/** - * kernel_power_off - power_off the system - * - * Shutdown everything and perform a clean system power_off. - */ -void kernel_power_off(void) -{ - kernel_shutdown_prepare(SYSTEM_POWER_OFF); - if (pm_power_off_prepare) - pm_power_off_prepare(); - disable_nonboot_cpus(); - sysdev_shutdown(); - printk(KERN_EMERG "Power down.\n"); - machine_power_off(); -} -EXPORT_SYMBOL_GPL(kernel_power_off); -/* - * Reboot system call: for obvious reasons only root may call it, - * and even root needs to set up some magic numbers in the registers - * so that some mistake won't make this reboot the whole machine. - * You can also set the meaning of the ctrl-alt-del-key here. - * - * reboot doesn't sync: do that yourself before calling this. - */ -asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg) -{ - char buffer[256]; - - /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) - return -EPERM; - - /* For safety, we require "magic" arguments. */ - if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && - magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) - return -EINVAL; - - /* Instead of trying to make the power_off code look like - * halt when pm_power_off is not set do it the easy way. - */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) - cmd = LINUX_REBOOT_CMD_HALT; - - lock_kernel(); - switch (cmd) { - case LINUX_REBOOT_CMD_RESTART: - kernel_restart(NULL); - break; - - case LINUX_REBOOT_CMD_CAD_ON: - C_A_D = 1; - break; - - case LINUX_REBOOT_CMD_CAD_OFF: - C_A_D = 0; - break; - - case LINUX_REBOOT_CMD_HALT: - kernel_halt(); - unlock_kernel(); - do_exit(0); - break; - - case LINUX_REBOOT_CMD_POWER_OFF: - kernel_power_off(); - unlock_kernel(); - do_exit(0); - break; - - case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { - unlock_kernel(); - return -EFAULT; - } - buffer[sizeof(buffer) - 1] = '\0'; - - kernel_restart(buffer); - break; - - case LINUX_REBOOT_CMD_KEXEC: - kernel_kexec(); - unlock_kernel(); - return -EINVAL; - -#ifdef CONFIG_HIBERNATION - case LINUX_REBOOT_CMD_SW_SUSPEND: - { - int ret = hibernate(); - unlock_kernel(); - return ret; - } -#endif - - default: - unlock_kernel(); - return -EINVAL; - } - unlock_kernel(); - return 0; -} - -static void deferred_cad(struct work_struct *dummy) -{ - kernel_restart(NULL); -} - -/* - * This function gets called by ctrl-alt-del - ie the keyboard interrupt. - * As it's called within an interrupt, it may NOT sync: the only choice - * is whether to reboot at once, or just ignore the ctrl-alt-del. - */ -void ctrl_alt_del(void) -{ - static DECLARE_WORK(cad_work, deferred_cad); - - if (C_A_D) - schedule_work(&cad_work); - else - kill_cad_pid(SIGINT, 1); -} - /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) @@ -479,48 +311,56 @@ void ctrl_alt_del(void) * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). */ -asmlinkage long sys_setregid(gid_t rgid, gid_t egid) +SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { - int old_rgid = current->gid; - int old_egid = current->egid; - int new_rgid = old_rgid; - int new_egid = old_egid; + struct user_namespace *ns = current_user_ns(); + const struct cred *old; + struct cred *new; int retval; + kgid_t krgid, kegid; - retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); - if (retval) - return retval; + krgid = make_kgid(ns, rgid); + kegid = make_kgid(ns, egid); + + if ((rgid != (gid_t) -1) && !gid_valid(krgid)) + return -EINVAL; + if ((egid != (gid_t) -1) && !gid_valid(kegid)) + return -EINVAL; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = -EPERM; if (rgid != (gid_t) -1) { - if ((old_rgid == rgid) || - (current->egid==rgid) || - capable(CAP_SETGID)) - new_rgid = rgid; + if (gid_eq(old->gid, krgid) || + gid_eq(old->egid, krgid) || + ns_capable(old->user_ns, CAP_SETGID)) + new->gid = krgid; else - return -EPERM; + goto error; } if (egid != (gid_t) -1) { - if ((old_rgid == egid) || - (current->egid == egid) || - (current->sgid == egid) || - capable(CAP_SETGID)) - new_egid = egid; + if (gid_eq(old->gid, kegid) || + gid_eq(old->egid, kegid) || + gid_eq(old->sgid, kegid) || + ns_capable(old->user_ns, CAP_SETGID)) + new->egid = kegid; else - return -EPERM; - } - if (new_egid != old_egid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + goto error; } + if (rgid != (gid_t) -1 || - (egid != (gid_t) -1 && egid != old_rgid)) - current->sgid = new_egid; - current->fsgid = new_egid; - current->egid = new_egid; - current->gid = new_rgid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; + (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) + new->sgid = new->egid; + new->fsgid = new->egid; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; } /* @@ -528,58 +368,64 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) * * SMP: Same implicit races as above. */ -asmlinkage long sys_setgid(gid_t gid) +SYSCALL_DEFINE1(setgid, gid_t, gid) { - int old_egid = current->egid; + struct user_namespace *ns = current_user_ns(); + const struct cred *old; + struct cred *new; int retval; + kgid_t kgid; - retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); - if (retval) - return retval; + kgid = make_kgid(ns, gid); + if (!gid_valid(kgid)) + return -EINVAL; - if (capable(CAP_SETGID)) { - if (old_egid != gid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->gid = current->egid = current->sgid = current->fsgid = gid; - } else if ((gid == current->gid) || (gid == current->sgid)) { - if (old_egid != gid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->egid = current->fsgid = gid; - } + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = -EPERM; + if (ns_capable(old->user_ns, CAP_SETGID)) + new->gid = new->egid = new->sgid = new->fsgid = kgid; + else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) + new->egid = new->fsgid = kgid; else - return -EPERM; + goto error; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; + return commit_creds(new); + +error: + abort_creds(new); + return retval; } - -static int set_user(uid_t new_ruid, int dumpclear) + +/* + * change the user struct in a credentials set to match the new UID + */ +static int set_user(struct cred *new) { struct user_struct *new_user; - new_user = alloc_uid(current->nsproxy->user_ns, new_ruid); + new_user = alloc_uid(new->uid); if (!new_user) return -EAGAIN; - if (atomic_read(&new_user->processes) >= - current->signal->rlim[RLIMIT_NPROC].rlim_cur && - new_user != current->nsproxy->user_ns->root_user) { - free_uid(new_user); - return -EAGAIN; - } - - switch_uid(new_user); + /* + * We don't fail in case of NPROC limit excess here because too many + * poorly written programs don't check set*uid() return code, assuming + * it never fails if called by root. We may still enforce NPROC limit + * for programs doing set*uid()+execve() by harmlessly deferring the + * failure to the execve() stage. + */ + if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && + new_user != INIT_USER) + current->flags |= PF_NPROC_EXCEEDED; + else + current->flags &= ~PF_NPROC_EXCEEDED; - if (dumpclear) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->uid = new_ruid; + free_uid(new->user); + new->user = new_user; return 0; } @@ -598,56 +444,65 @@ static int set_user(uid_t new_ruid, int dumpclear) * 100% compatible with BSD. A program which uses just setuid() will be * 100% compatible with POSIX with saved IDs. */ -asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) +SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { - int old_ruid, old_euid, old_suid, new_ruid, new_euid; + struct user_namespace *ns = current_user_ns(); + const struct cred *old; + struct cred *new; int retval; + kuid_t kruid, keuid; - retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); - if (retval) - return retval; + kruid = make_kuid(ns, ruid); + keuid = make_kuid(ns, euid); - new_ruid = old_ruid = current->uid; - new_euid = old_euid = current->euid; - old_suid = current->suid; + if ((ruid != (uid_t) -1) && !uid_valid(kruid)) + return -EINVAL; + if ((euid != (uid_t) -1) && !uid_valid(keuid)) + return -EINVAL; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = -EPERM; if (ruid != (uid_t) -1) { - new_ruid = ruid; - if ((old_ruid != ruid) && - (current->euid != ruid) && - !capable(CAP_SETUID)) - return -EPERM; + new->uid = kruid; + if (!uid_eq(old->uid, kruid) && + !uid_eq(old->euid, kruid) && + !ns_capable(old->user_ns, CAP_SETUID)) + goto error; } if (euid != (uid_t) -1) { - new_euid = euid; - if ((old_ruid != euid) && - (current->euid != euid) && - (current->suid != euid) && - !capable(CAP_SETUID)) - return -EPERM; + new->euid = keuid; + if (!uid_eq(old->uid, keuid) && + !uid_eq(old->euid, keuid) && + !uid_eq(old->suid, keuid) && + !ns_capable(old->user_ns, CAP_SETUID)) + goto error; } - if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) - return -EAGAIN; - - if (new_euid != old_euid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + if (!uid_eq(new->uid, old->uid)) { + retval = set_user(new); + if (retval < 0) + goto error; } - current->fsuid = current->euid = new_euid; if (ruid != (uid_t) -1 || - (euid != (uid_t) -1 && euid != old_ruid)) - current->suid = current->euid; - current->fsuid = current->euid; + (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) + new->suid = new->euid; + new->fsuid = new->euid; - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); - - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); -} + retval = security_task_fix_setuid(new, old, LSM_SETID_RE); + if (retval < 0) + goto error; + return commit_creds(new); +error: + abort_creds(new); + return retval; +} /* * setuid() is implemented like SysV with SAVED_IDS @@ -660,38 +515,46 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) * will allow a root program to temporarily drop privileges and be able to * regain them by swapping the real and effective uid. */ -asmlinkage long sys_setuid(uid_t uid) +SYSCALL_DEFINE1(setuid, uid_t, uid) { - int old_euid = current->euid; - int old_ruid, old_suid, new_suid; + struct user_namespace *ns = current_user_ns(); + const struct cred *old; + struct cred *new; int retval; + kuid_t kuid; - retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); - if (retval) - return retval; - - old_ruid = current->uid; - old_suid = current->suid; - new_suid = old_suid; - - if (capable(CAP_SETUID)) { - if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) - return -EAGAIN; - new_suid = uid; - } else if ((uid != current->uid) && (uid != new_suid)) - return -EPERM; + kuid = make_kuid(ns, uid); + if (!uid_valid(kuid)) + return -EINVAL; - if (old_euid != uid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = -EPERM; + if (ns_capable(old->user_ns, CAP_SETUID)) { + new->suid = new->uid = kuid; + if (!uid_eq(kuid, old->uid)) { + retval = set_user(new); + if (retval < 0) + goto error; + } + } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { + goto error; } - current->fsuid = current->euid = uid; - current->suid = new_suid; - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); + new->fsuid = new->euid = kuid; + + retval = security_task_fix_setuid(new, old, LSM_SETID_ID); + if (retval < 0) + goto error; - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); + return commit_creds(new); + +error: + abort_creds(new); + return retval; } @@ -699,56 +562,84 @@ asmlinkage long sys_setuid(uid_t uid) * This function implements a generic ability to update ruid, euid, * and suid. This allows you to implement the 4.4 compatible seteuid(). */ -asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) +SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) { - int old_ruid = current->uid; - int old_euid = current->euid; - int old_suid = current->suid; + struct user_namespace *ns = current_user_ns(); + const struct cred *old; + struct cred *new; int retval; + kuid_t kruid, keuid, ksuid; - retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); - if (retval) - return retval; + kruid = make_kuid(ns, ruid); + keuid = make_kuid(ns, euid); + ksuid = make_kuid(ns, suid); - if (!capable(CAP_SETUID)) { - if ((ruid != (uid_t) -1) && (ruid != current->uid) && - (ruid != current->euid) && (ruid != current->suid)) - return -EPERM; - if ((euid != (uid_t) -1) && (euid != current->uid) && - (euid != current->euid) && (euid != current->suid)) - return -EPERM; - if ((suid != (uid_t) -1) && (suid != current->uid) && - (suid != current->euid) && (suid != current->suid)) - return -EPERM; + if ((ruid != (uid_t) -1) && !uid_valid(kruid)) + return -EINVAL; + + if ((euid != (uid_t) -1) && !uid_valid(keuid)) + return -EINVAL; + + if ((suid != (uid_t) -1) && !uid_valid(ksuid)) + return -EINVAL; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + old = current_cred(); + + retval = -EPERM; + if (!ns_capable(old->user_ns, CAP_SETUID)) { + if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && + !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) + goto error; + if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && + !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) + goto error; + if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && + !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) + goto error; } + if (ruid != (uid_t) -1) { - if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0) - return -EAGAIN; - } - if (euid != (uid_t) -1) { - if (euid != current->euid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + new->uid = kruid; + if (!uid_eq(kruid, old->uid)) { + retval = set_user(new); + if (retval < 0) + goto error; } - current->euid = euid; } - current->fsuid = current->euid; + if (euid != (uid_t) -1) + new->euid = keuid; if (suid != (uid_t) -1) - current->suid = suid; + new->suid = ksuid; + new->fsuid = new->euid; - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); + retval = security_task_fix_setuid(new, old, LSM_SETID_RES); + if (retval < 0) + goto error; - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); + return commit_creds(new); + +error: + abort_creds(new); + return retval; } -asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) +SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) { + const struct cred *cred = current_cred(); int retval; + uid_t ruid, euid, suid; + + ruid = from_kuid_munged(cred->user_ns, cred->uid); + euid = from_kuid_munged(cred->user_ns, cred->euid); + suid = from_kuid_munged(cred->user_ns, cred->suid); - if (!(retval = put_user(current->uid, ruid)) && - !(retval = put_user(current->euid, euid))) - retval = put_user(current->suid, suid); + if (!(retval = put_user(ruid, ruidp)) && + !(retval = put_user(euid, euidp))) + retval = put_user(suid, suidp); return retval; } @@ -756,50 +647,71 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us /* * Same as above, but for rgid, egid, sgid. */ -asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) +SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) { + struct user_namespace *ns = current_user_ns(); + const struct cred *old; + struct cred *new; int retval; + kgid_t krgid, kegid, ksgid; - retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); - if (retval) - return retval; + krgid = make_kgid(ns, rgid); + kegid = make_kgid(ns, egid); + ksgid = make_kgid(ns, sgid); - if (!capable(CAP_SETGID)) { - if ((rgid != (gid_t) -1) && (rgid != current->gid) && - (rgid != current->egid) && (rgid != current->sgid)) - return -EPERM; - if ((egid != (gid_t) -1) && (egid != current->gid) && - (egid != current->egid) && (egid != current->sgid)) - return -EPERM; - if ((sgid != (gid_t) -1) && (sgid != current->gid) && - (sgid != current->egid) && (sgid != current->sgid)) - return -EPERM; - } - if (egid != (gid_t) -1) { - if (egid != current->egid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->egid = egid; + if ((rgid != (gid_t) -1) && !gid_valid(krgid)) + return -EINVAL; + if ((egid != (gid_t) -1) && !gid_valid(kegid)) + return -EINVAL; + if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) + return -EINVAL; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = -EPERM; + if (!ns_capable(old->user_ns, CAP_SETGID)) { + if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && + !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) + goto error; + if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && + !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) + goto error; + if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && + !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) + goto error; } - current->fsgid = current->egid; + if (rgid != (gid_t) -1) - current->gid = rgid; + new->gid = krgid; + if (egid != (gid_t) -1) + new->egid = kegid; if (sgid != (gid_t) -1) - current->sgid = sgid; + new->sgid = ksgid; + new->fsgid = new->egid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; + return commit_creds(new); + +error: + abort_creds(new); + return retval; } -asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) +SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) { + const struct cred *cred = current_cred(); int retval; + gid_t rgid, egid, sgid; + + rgid = from_kgid_munged(cred->user_ns, cred->gid); + egid = from_kgid_munged(cred->user_ns, cred->egid); + sgid = from_kgid_munged(cred->user_ns, cred->sgid); - if (!(retval = put_user(current->gid, rgid)) && - !(retval = put_user(current->egid, egid))) - retval = put_user(current->sgid, sgid); + if (!(retval = put_user(rgid, rgidp)) && + !(retval = put_user(egid, egidp))) + retval = put_user(sgid, sgidp); return retval; } @@ -811,92 +723,166 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us * whatever uid it wants to). It normally shadows "euid", except when * explicitly set by setfsuid() or for access.. */ -asmlinkage long sys_setfsuid(uid_t uid) +SYSCALL_DEFINE1(setfsuid, uid_t, uid) { - int old_fsuid; + const struct cred *old; + struct cred *new; + uid_t old_fsuid; + kuid_t kuid; - old_fsuid = current->fsuid; - if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) + old = current_cred(); + old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); + + kuid = make_kuid(old->user_ns, uid); + if (!uid_valid(kuid)) + return old_fsuid; + + new = prepare_creds(); + if (!new) return old_fsuid; - if (uid == current->uid || uid == current->euid || - uid == current->suid || uid == current->fsuid || - capable(CAP_SETUID)) { - if (uid != old_fsuid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || + uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || + ns_capable(old->user_ns, CAP_SETUID)) { + if (!uid_eq(kuid, old->fsuid)) { + new->fsuid = kuid; + if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) + goto change_okay; } - current->fsuid = uid; } - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); - - security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); + abort_creds(new); + return old_fsuid; +change_okay: + commit_creds(new); return old_fsuid; } /* * Samma på svenska.. */ -asmlinkage long sys_setfsgid(gid_t gid) +SYSCALL_DEFINE1(setfsgid, gid_t, gid) { - int old_fsgid; + const struct cred *old; + struct cred *new; + gid_t old_fsgid; + kgid_t kgid; + + old = current_cred(); + old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); - old_fsgid = current->fsgid; - if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) + kgid = make_kgid(old->user_ns, gid); + if (!gid_valid(kgid)) return old_fsgid; - if (gid == current->gid || gid == current->egid || - gid == current->sgid || gid == current->fsgid || - capable(CAP_SETGID)) { - if (gid != old_fsgid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + new = prepare_creds(); + if (!new) + return old_fsgid; + + if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || + gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || + ns_capable(old->user_ns, CAP_SETGID)) { + if (!gid_eq(kgid, old->fsgid)) { + new->fsgid = kgid; + goto change_okay; } - current->fsgid = gid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); } + + abort_creds(new); + return old_fsgid; + +change_okay: + commit_creds(new); return old_fsgid; } -asmlinkage long sys_times(struct tms __user * tbuf) +/** + * sys_getpid - return the thread group id of the current process + * + * Note, despite the name, this returns the tgid not the pid. The tgid and + * the pid are identical unless CLONE_THREAD was specified on clone() in + * which case the tgid is the same in all threads of the same group. + * + * This is SMP safe as current->tgid does not change. + */ +SYSCALL_DEFINE0(getpid) +{ + return task_tgid_vnr(current); +} + +/* Thread ID - the internal kernel "pid" */ +SYSCALL_DEFINE0(gettid) +{ + return task_pid_vnr(current); +} + +/* + * Accessing ->real_parent is not SMP-safe, it could + * change from under us. However, we can use a stale + * value of ->real_parent under rcu_read_lock(), see + * release_task()->call_rcu(delayed_put_task_struct). + */ +SYSCALL_DEFINE0(getppid) +{ + int pid; + + rcu_read_lock(); + pid = task_tgid_vnr(rcu_dereference(current->real_parent)); + rcu_read_unlock(); + + return pid; +} + +SYSCALL_DEFINE0(getuid) +{ + /* Only we change this so SMP safe */ + return from_kuid_munged(current_user_ns(), current_uid()); +} + +SYSCALL_DEFINE0(geteuid) +{ + /* Only we change this so SMP safe */ + return from_kuid_munged(current_user_ns(), current_euid()); +} + +SYSCALL_DEFINE0(getgid) +{ + /* Only we change this so SMP safe */ + return from_kgid_munged(current_user_ns(), current_gid()); +} + +SYSCALL_DEFINE0(getegid) +{ + /* Only we change this so SMP safe */ + return from_kgid_munged(current_user_ns(), current_egid()); +} + +void do_sys_times(struct tms *tms) +{ + cputime_t tgutime, tgstime, cutime, cstime; + + spin_lock_irq(¤t->sighand->siglock); + thread_group_cputime_adjusted(current, &tgutime, &tgstime); + cutime = current->signal->cutime; + cstime = current->signal->cstime; + spin_unlock_irq(¤t->sighand->siglock); + tms->tms_utime = cputime_to_clock_t(tgutime); + tms->tms_stime = cputime_to_clock_t(tgstime); + tms->tms_cutime = cputime_to_clock_t(cutime); + tms->tms_cstime = cputime_to_clock_t(cstime); +} + +SYSCALL_DEFINE1(times, struct tms __user *, tbuf) { - /* - * In the SMP world we might just be unlucky and have one of - * the times increment as we use it. Since the value is an - * atomically safe type this is just fine. Conceptually its - * as if the syscall took an instant longer to occur. - */ if (tbuf) { struct tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; - cputime_t utime, stime, cutime, cstime; - - spin_lock_irq(&tsk->sighand->siglock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - - tmp.tms_utime = cputime_to_clock_t(utime); - tmp.tms_stime = cputime_to_clock_t(stime); - tmp.tms_cutime = cputime_to_clock_t(cutime); - tmp.tms_cstime = cputime_to_clock_t(cstime); + + do_sys_times(&tmp); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } + force_successful_syscall_return(); return (long) jiffies_64_to_clock_t(get_jiffies_64()); } @@ -909,10 +895,9 @@ asmlinkage long sys_times(struct tms __user * tbuf) * only important on a multi-user system anyway, to make sure one user * can't send a signal to a process owned by another. -TYT, 12/12/91 * - * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. - * LBT 04.03.94 + * !PF_FORKNOEXEC check to conform completely to POSIX. */ -asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) +SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; @@ -925,6 +910,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) pgid = pid; if (pgid < 0) return -EINVAL; + rcu_read_lock(); /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM @@ -945,7 +931,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (task_session(p) != task_session(group_leader)) goto out; err = -EACCES; - if (p->did_exec) + if (!(p->flags & PF_FORKNOEXEC)) goto out; } else { err = -ESRCH; @@ -971,72 +957,94 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (err) goto out; - if (task_pgrp(p) != pgrp) { - detach_pid(p, PIDTYPE_PGID); - attach_pid(p, PIDTYPE_PGID, pgrp); - set_task_pgrp(p, pid_nr(pgrp)); - } + if (task_pgrp(p) != pgrp) + change_pid(p, PIDTYPE_PGID, pgrp); err = 0; out: /* All paths lead to here, thus we are safe. -DaveM */ write_unlock_irq(&tasklist_lock); + rcu_read_unlock(); return err; } -asmlinkage long sys_getpgid(pid_t pid) +SYSCALL_DEFINE1(getpgid, pid_t, pid) { + struct task_struct *p; + struct pid *grp; + int retval; + + rcu_read_lock(); if (!pid) - return task_pgrp_vnr(current); + grp = task_pgrp(current); else { - int retval; - struct task_struct *p; - - read_lock(&tasklist_lock); - p = find_task_by_vpid(pid); retval = -ESRCH; - if (p) { - retval = security_task_getpgid(p); - if (!retval) - retval = task_pgrp_vnr(p); - } - read_unlock(&tasklist_lock); - return retval; + p = find_task_by_vpid(pid); + if (!p) + goto out; + grp = task_pgrp(p); + if (!grp) + goto out; + + retval = security_task_getpgid(p); + if (retval) + goto out; } + retval = pid_vnr(grp); +out: + rcu_read_unlock(); + return retval; } #ifdef __ARCH_WANT_SYS_GETPGRP -asmlinkage long sys_getpgrp(void) +SYSCALL_DEFINE0(getpgrp) { - /* SMP - assuming writes are word atomic this is fine */ - return task_pgrp_vnr(current); + return sys_getpgid(0); } #endif -asmlinkage long sys_getsid(pid_t pid) +SYSCALL_DEFINE1(getsid, pid_t, pid) { + struct task_struct *p; + struct pid *sid; + int retval; + + rcu_read_lock(); if (!pid) - return task_session_vnr(current); + sid = task_session(current); else { - int retval; - struct task_struct *p; - - rcu_read_lock(); - p = find_task_by_vpid(pid); retval = -ESRCH; - if (p) { - retval = security_task_getsid(p); - if (!retval) - retval = task_session_vnr(p); - } - rcu_read_unlock(); - return retval; + p = find_task_by_vpid(pid); + if (!p) + goto out; + sid = task_session(p); + if (!sid) + goto out; + + retval = security_task_getsid(p); + if (retval) + goto out; } + retval = pid_vnr(sid); +out: + rcu_read_unlock(); + return retval; } -asmlinkage long sys_setsid(void) +static void set_special_pids(struct pid *pid) +{ + struct task_struct *curr = current->group_leader; + + if (task_session(curr) != pid) + change_pid(curr, PIDTYPE_SID, pid); + + if (task_pgrp(curr) != pid) + change_pid(curr, PIDTYPE_PGID, pid); +} + +SYSCALL_DEFINE0(setsid) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); @@ -1055,304 +1063,154 @@ asmlinkage long sys_setsid(void) goto out; group_leader->signal->leader = 1; - __set_special_pids(sid); + set_special_pids(sid); - spin_lock(&group_leader->sighand->siglock); - group_leader->signal->tty = NULL; - spin_unlock(&group_leader->sighand->siglock); + proc_clear_tty(group_leader); err = session; out: write_unlock_irq(&tasklist_lock); - return err; -} - -/* - * Supplementary group IDs - */ - -/* init to 2 - one for init_task, one to ensure it is never freed */ -struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; - -struct group_info *groups_alloc(int gidsetsize) -{ - struct group_info *group_info; - int nblocks; - int i; - - nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; - /* Make sure we always allocate at least one indirect block pointer */ - nblocks = nblocks ? : 1; - group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); - if (!group_info) - return NULL; - group_info->ngroups = gidsetsize; - group_info->nblocks = nblocks; - atomic_set(&group_info->usage, 1); - - if (gidsetsize <= NGROUPS_SMALL) - group_info->blocks[0] = group_info->small_block; - else { - for (i = 0; i < nblocks; i++) { - gid_t *b; - b = (void *)__get_free_page(GFP_USER); - if (!b) - goto out_undo_partial_alloc; - group_info->blocks[i] = b; - } - } - return group_info; - -out_undo_partial_alloc: - while (--i >= 0) { - free_page((unsigned long)group_info->blocks[i]); + if (err > 0) { + proc_sid_connector(group_leader); + sched_autogroup_create_attach(group_leader); } - kfree(group_info); - return NULL; -} - -EXPORT_SYMBOL(groups_alloc); - -void groups_free(struct group_info *group_info) -{ - if (group_info->blocks[0] != group_info->small_block) { - int i; - for (i = 0; i < group_info->nblocks; i++) - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); + return err; } -EXPORT_SYMBOL(groups_free); - -/* export the group_info to a user-space array */ -static int groups_to_user(gid_t __user *grouplist, - struct group_info *group_info) -{ - int i; - unsigned int count = group_info->ngroups; - - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); - - if (copy_to_user(grouplist, group_info->blocks[i], len)) - return -EFAULT; +DECLARE_RWSEM(uts_sem); - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; - } - return 0; -} +#ifdef COMPAT_UTS_MACHINE +#define override_architecture(name) \ + (personality(current->personality) == PER_LINUX32 && \ + copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ + sizeof(COMPAT_UTS_MACHINE))) +#else +#define override_architecture(name) 0 +#endif -/* fill a group_info from a user-space array - it must be allocated already */ -static int groups_from_user(struct group_info *group_info, - gid_t __user *grouplist) +/* + * Work around broken programs that cannot handle "Linux 3.0". + * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 + */ +static int override_release(char __user *release, size_t len) { - int i; - unsigned int count = group_info->ngroups; + int ret = 0; - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); + if (current->personality & UNAME26) { + const char *rest = UTS_RELEASE; + char buf[65] = { 0 }; + int ndots = 0; + unsigned v; + size_t copy; - if (copy_from_user(group_info->blocks[i], grouplist, len)) - return -EFAULT; - - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; - } - return 0; -} - -/* a simple Shell sort */ -static void groups_sort(struct group_info *group_info) -{ - int base, max, stride; - int gidsetsize = group_info->ngroups; - - for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) - ; /* nothing */ - stride /= 3; - - while (stride) { - max = gidsetsize - stride; - for (base = 0; base < max; base++) { - int left = base; - int right = left + stride; - gid_t tmp = GROUP_AT(group_info, right); - - while (left >= 0 && GROUP_AT(group_info, left) > tmp) { - GROUP_AT(group_info, right) = - GROUP_AT(group_info, left); - right = left; - left -= stride; - } - GROUP_AT(group_info, right) = tmp; + while (*rest) { + if (*rest == '.' && ++ndots >= 3) + break; + if (!isdigit(*rest) && *rest != '.') + break; + rest++; } - stride /= 3; - } -} - -/* a simple bsearch */ -int groups_search(struct group_info *group_info, gid_t grp) -{ - unsigned int left, right; - - if (!group_info) - return 0; - - left = 0; - right = group_info->ngroups; - while (left < right) { - unsigned int mid = (left+right)/2; - int cmp = grp - GROUP_AT(group_info, mid); - if (cmp > 0) - left = mid + 1; - else if (cmp < 0) - right = mid; - else - return 1; + v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; + copy = clamp_t(size_t, len, 1, sizeof(buf)); + copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); + ret = copy_to_user(release, buf, copy + 1); } - return 0; -} - -/* validate and set current->group_info */ -int set_current_groups(struct group_info *group_info) -{ - int retval; - struct group_info *old_info; - - retval = security_task_setgroups(group_info); - if (retval) - return retval; - - groups_sort(group_info); - get_group_info(group_info); - - task_lock(current); - old_info = current->group_info; - current->group_info = group_info; - task_unlock(current); - - put_group_info(old_info); - - return 0; + return ret; } -EXPORT_SYMBOL(set_current_groups); - -asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) +SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { - int i = 0; - - /* - * SMP: Nobody else can change our grouplist. Thus we are - * safe. - */ + int errno = 0; - if (gidsetsize < 0) - return -EINVAL; + down_read(&uts_sem); + if (copy_to_user(name, utsname(), sizeof *name)) + errno = -EFAULT; + up_read(&uts_sem); - /* no need to grab task_lock here; it cannot change */ - i = current->group_info->ngroups; - if (gidsetsize) { - if (i > gidsetsize) { - i = -EINVAL; - goto out; - } - if (groups_to_user(grouplist, current->group_info)) { - i = -EFAULT; - goto out; - } - } -out: - return i; + if (!errno && override_release(name->release, sizeof(name->release))) + errno = -EFAULT; + if (!errno && override_architecture(name)) + errno = -EFAULT; + return errno; } +#ifdef __ARCH_WANT_SYS_OLD_UNAME /* - * SMP: Our groups are copy-on-write. We can set them safely - * without another task interfering. + * Old cruft */ - -asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) +SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) { - struct group_info *group_info; - int retval; - - if (!capable(CAP_SETGID)) - return -EPERM; - if ((unsigned)gidsetsize > NGROUPS_MAX) - return -EINVAL; - - group_info = groups_alloc(gidsetsize); - if (!group_info) - return -ENOMEM; - retval = groups_from_user(group_info, grouplist); - if (retval) { - put_group_info(group_info); - return retval; - } + int error = 0; - retval = set_current_groups(group_info); - put_group_info(group_info); + if (!name) + return -EFAULT; - return retval; -} + down_read(&uts_sem); + if (copy_to_user(name, utsname(), sizeof(*name))) + error = -EFAULT; + up_read(&uts_sem); -/* - * Check whether we're fsgid/egid or in the supplemental group.. - */ -int in_group_p(gid_t grp) -{ - int retval = 1; - if (grp != current->fsgid) - retval = groups_search(current->group_info, grp); - return retval; + if (!error && override_release(name->release, sizeof(name->release))) + error = -EFAULT; + if (!error && override_architecture(name)) + error = -EFAULT; + return error; } -EXPORT_SYMBOL(in_group_p); - -int in_egroup_p(gid_t grp) +SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) { - int retval = 1; - if (grp != current->egid) - retval = groups_search(current->group_info, grp); - return retval; -} - -EXPORT_SYMBOL(in_egroup_p); - -DECLARE_RWSEM(uts_sem); - -EXPORT_SYMBOL(uts_sem); + int error; -asmlinkage long sys_newuname(struct new_utsname __user * name) -{ - int errno = 0; + if (!name) + return -EFAULT; + if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) + return -EFAULT; down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof *name)) - errno = -EFAULT; + error = __copy_to_user(&name->sysname, &utsname()->sysname, + __OLD_UTS_LEN); + error |= __put_user(0, name->sysname + __OLD_UTS_LEN); + error |= __copy_to_user(&name->nodename, &utsname()->nodename, + __OLD_UTS_LEN); + error |= __put_user(0, name->nodename + __OLD_UTS_LEN); + error |= __copy_to_user(&name->release, &utsname()->release, + __OLD_UTS_LEN); + error |= __put_user(0, name->release + __OLD_UTS_LEN); + error |= __copy_to_user(&name->version, &utsname()->version, + __OLD_UTS_LEN); + error |= __put_user(0, name->version + __OLD_UTS_LEN); + error |= __copy_to_user(&name->machine, &utsname()->machine, + __OLD_UTS_LEN); + error |= __put_user(0, name->machine + __OLD_UTS_LEN); up_read(&uts_sem); - return errno; + + if (!error && override_architecture(name)) + error = -EFAULT; + if (!error && override_release(name->release, sizeof(name->release))) + error = -EFAULT; + return error ? -EFAULT : 0; } +#endif -asmlinkage long sys_sethostname(char __user *name, int len) +SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->nodename, tmp, len); - utsname()->nodename[len] = 0; + struct new_utsname *u = utsname(); + + memcpy(u->nodename, tmp, len); + memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; + uts_proc_notify(UTS_PROC_HOSTNAME); } up_write(&uts_sem); return errno; @@ -1360,18 +1218,20 @@ asmlinkage long sys_sethostname(char __user *name, int len) #ifdef __ARCH_WANT_SYS_GETHOSTNAME -asmlinkage long sys_gethostname(char __user *name, int len) +SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) { int i, errno; + struct new_utsname *u; if (len < 0) return -EINVAL; down_read(&uts_sem); - i = 1 + strlen(utsname()->nodename); + u = utsname(); + i = 1 + strlen(u->nodename); if (i > len) i = len; errno = 0; - if (copy_to_user(name, utsname()->nodename, i)) + if (copy_to_user(name, u->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1383,12 +1243,12 @@ asmlinkage long sys_gethostname(char __user *name, int len) * Only setdomainname; getdomainname can be implemented by calling * uname() */ -asmlinkage long sys_setdomainname(char __user *name, int len) +SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; @@ -1396,25 +1256,27 @@ asmlinkage long sys_setdomainname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->domainname, tmp, len); - utsname()->domainname[len] = 0; + struct new_utsname *u = utsname(); + + memcpy(u->domainname, tmp, len); + memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; + uts_proc_notify(UTS_PROC_DOMAINNAME); } up_write(&uts_sem); return errno; } -asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) +SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { - if (resource >= RLIM_NLIMITS) - return -EINVAL; - else { - struct rlimit value; - task_lock(current->group_leader); - value = current->signal->rlim[resource]; - task_unlock(current->group_leader); - return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; - } + struct rlimit value; + int ret; + + ret = do_prlimit(current, resource, NULL, &value); + if (!ret) + ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; + + return ret; } #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT @@ -1423,7 +1285,8 @@ asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) * Back compatibility for getrlimit. Needed for some apps. */ -asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim) +SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, + struct rlimit __user *, rlim) { struct rlimit x; if (resource >= RLIM_NLIMITS) @@ -1441,45 +1304,91 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r #endif -asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) +static inline bool rlim64_is_infinity(__u64 rlim64) { - struct rlimit new_rlim, *old_rlim; - unsigned long it_prof_secs; - int retval; +#if BITS_PER_LONG < 64 + return rlim64 >= ULONG_MAX; +#else + return rlim64 == RLIM64_INFINITY; +#endif +} + +static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) +{ + if (rlim->rlim_cur == RLIM_INFINITY) + rlim64->rlim_cur = RLIM64_INFINITY; + else + rlim64->rlim_cur = rlim->rlim_cur; + if (rlim->rlim_max == RLIM_INFINITY) + rlim64->rlim_max = RLIM64_INFINITY; + else + rlim64->rlim_max = rlim->rlim_max; +} + +static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) +{ + if (rlim64_is_infinity(rlim64->rlim_cur)) + rlim->rlim_cur = RLIM_INFINITY; + else + rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; + if (rlim64_is_infinity(rlim64->rlim_max)) + rlim->rlim_max = RLIM_INFINITY; + else + rlim->rlim_max = (unsigned long)rlim64->rlim_max; +} + +/* make sure you are allowed to change @tsk limits before calling this */ +int do_prlimit(struct task_struct *tsk, unsigned int resource, + struct rlimit *new_rlim, struct rlimit *old_rlim) +{ + struct rlimit *rlim; + int retval = 0; if (resource >= RLIM_NLIMITS) return -EINVAL; - if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) - return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; - old_rlim = current->signal->rlim + resource; - if ((new_rlim.rlim_max > old_rlim->rlim_max) && - !capable(CAP_SYS_RESOURCE)) - return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) - return -EPERM; - - retval = security_task_setrlimit(resource, &new_rlim); - if (retval) - return retval; - - if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". So let's cheat and make it one second - * instead - */ - new_rlim.rlim_cur = 1; + if (new_rlim) { + if (new_rlim->rlim_cur > new_rlim->rlim_max) + return -EINVAL; + if (resource == RLIMIT_NOFILE && + new_rlim->rlim_max > sysctl_nr_open) + return -EPERM; } - task_lock(current->group_leader); - *old_rlim = new_rlim; - task_unlock(current->group_leader); - - if (resource != RLIMIT_CPU) + /* protect tsk->signal and tsk->sighand from disappearing */ + read_lock(&tasklist_lock); + if (!tsk->sighand) { + retval = -ESRCH; goto out; + } + + rlim = tsk->signal->rlim + resource; + task_lock(tsk->group_leader); + if (new_rlim) { + /* Keep the capable check against init_user_ns until + cgroups can contain all limits */ + if (new_rlim->rlim_max > rlim->rlim_max && + !capable(CAP_SYS_RESOURCE)) + retval = -EPERM; + if (!retval) + retval = security_task_setrlimit(tsk->group_leader, + resource, new_rlim); + if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + new_rlim->rlim_cur = 1; + } + } + if (!retval) { + if (old_rlim) + *old_rlim = *rlim; + if (new_rlim) + *rlim = *new_rlim; + } + task_unlock(tsk->group_leader); /* * RLIMIT_CPU handling. Note that the kernel fails to return an error @@ -1487,23 +1396,85 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) * very long-standing error, and fixing it now risks breakage of * applications, so we live with it */ - if (new_rlim.rlim_cur == RLIM_INFINITY) - goto out; + if (!retval && new_rlim && resource == RLIMIT_CPU && + new_rlim->rlim_cur != RLIM_INFINITY) + update_rlimit_cpu(tsk, new_rlim->rlim_cur); +out: + read_unlock(&tasklist_lock); + return retval; +} + +/* rcu lock must be held */ +static int check_prlimit_permission(struct task_struct *task) +{ + const struct cred *cred = current_cred(), *tcred; - it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); - if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) { - unsigned long rlim_cur = new_rlim.rlim_cur; - cputime_t cputime; - - cputime = secs_to_cputime(rlim_cur); - read_lock(&tasklist_lock); - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); - read_unlock(&tasklist_lock); + if (current == task) + return 0; + + tcred = __task_cred(task); + if (uid_eq(cred->uid, tcred->euid) && + uid_eq(cred->uid, tcred->suid) && + uid_eq(cred->uid, tcred->uid) && + gid_eq(cred->gid, tcred->egid) && + gid_eq(cred->gid, tcred->sgid) && + gid_eq(cred->gid, tcred->gid)) + return 0; + if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) + return 0; + + return -EPERM; +} + +SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, + const struct rlimit64 __user *, new_rlim, + struct rlimit64 __user *, old_rlim) +{ + struct rlimit64 old64, new64; + struct rlimit old, new; + struct task_struct *tsk; + int ret; + + if (new_rlim) { + if (copy_from_user(&new64, new_rlim, sizeof(new64))) + return -EFAULT; + rlim64_to_rlim(&new64, &new); } -out: - return 0; + + rcu_read_lock(); + tsk = pid ? find_task_by_vpid(pid) : current; + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + ret = check_prlimit_permission(tsk); + if (ret) { + rcu_read_unlock(); + return ret; + } + get_task_struct(tsk); + rcu_read_unlock(); + + ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, + old_rlim ? &old : NULL); + + if (!ret && old_rlim) { + rlim_to_rlim64(&old, &old64); + if (copy_to_user(old_rlim, &old64, sizeof(old64))) + ret = -EFAULT; + } + + put_task_struct(tsk); + return ret; +} + +SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +{ + struct rlimit new_rlim; + + if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) + return -EFAULT; + return do_prlimit(current, resource, &new_rlim, NULL); } /* @@ -1539,21 +1510,36 @@ out: * */ +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) +{ + r->ru_nvcsw += t->nvcsw; + r->ru_nivcsw += t->nivcsw; + r->ru_minflt += t->min_flt; + r->ru_majflt += t->maj_flt; + r->ru_inblock += task_io_get_inblock(t); + r->ru_oublock += task_io_get_oublock(t); +} + static void k_getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; - cputime_t utime, stime; + cputime_t tgutime, tgstime, utime, stime; + unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); - utime = stime = cputime_zero; + utime = stime = 0; - rcu_read_lock(); - if (!lock_task_sighand(p, &flags)) { - rcu_read_unlock(); - return; + if (who == RUSAGE_THREAD) { + task_cputime_adjusted(current, &utime, &stime); + accumulate_thread_rusage(p, r); + maxrss = p->signal->maxrss; + goto out; } + if (!lock_task_sighand(p, &flags)) + return; + switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: @@ -1565,42 +1551,46 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt = p->signal->cmaj_flt; r->ru_inblock = p->signal->cinblock; r->ru_oublock = p->signal->coublock; + maxrss = p->signal->cmaxrss; if (who == RUSAGE_CHILDREN) break; case RUSAGE_SELF: - utime = cputime_add(utime, p->signal->utime); - stime = cputime_add(stime, p->signal->stime); + thread_group_cputime_adjusted(p, &tgutime, &tgstime); + utime += tgutime; + stime += tgstime; r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; r->ru_majflt += p->signal->maj_flt; r->ru_inblock += p->signal->inblock; r->ru_oublock += p->signal->oublock; + if (maxrss < p->signal->maxrss) + maxrss = p->signal->maxrss; t = p; do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - r->ru_nvcsw += t->nvcsw; - r->ru_nivcsw += t->nivcsw; - r->ru_minflt += t->min_flt; - r->ru_majflt += t->maj_flt; - r->ru_inblock += task_io_get_inblock(t); - r->ru_oublock += task_io_get_oublock(t); - t = next_thread(t); - } while (t != p); + accumulate_thread_rusage(t, r); + } while_each_thread(p, t); break; default: BUG(); } - unlock_task_sighand(p, &flags); - rcu_read_unlock(); +out: cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); + + if (who != RUSAGE_CHILDREN) { + struct mm_struct *mm = get_task_mm(p); + if (mm) { + setmax_mm_hiwater_rss(&maxrss, mm); + mmput(mm); + } + } + r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ } int getrusage(struct task_struct *p, int who, struct rusage __user *ru) @@ -1610,143 +1600,426 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru) return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } -asmlinkage long sys_getrusage(int who, struct rusage __user *ru) +SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) { - if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) + if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && + who != RUSAGE_THREAD) return -EINVAL; return getrusage(current, who, ru); } -asmlinkage long sys_umask(int mask) +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) +{ + struct rusage r; + + if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && + who != RUSAGE_THREAD) + return -EINVAL; + + k_getrusage(current, who, &r); + return put_compat_rusage(&r, ru); +} +#endif + +SYSCALL_DEFINE1(umask, int, mask) { mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); return mask; } -asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5) +static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) +{ + struct fd exe; + struct inode *inode; + int err; + + exe = fdget(fd); + if (!exe.file) + return -EBADF; + + inode = file_inode(exe.file); + + /* + * Because the original mm->exe_file points to executable file, make + * sure that this one is executable as well, to avoid breaking an + * overall picture. + */ + err = -EACCES; + if (!S_ISREG(inode->i_mode) || + exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + + err = inode_permission(inode, MAY_EXEC); + if (err) + goto exit; + + down_write(&mm->mmap_sem); + + /* + * Forbid mm->exe_file change if old file still mapped. + */ + err = -EBUSY; + if (mm->exe_file) { + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) + if (vma->vm_file && + path_equal(&vma->vm_file->f_path, + &mm->exe_file->f_path)) + goto exit_unlock; + } + + /* + * The symlink can be changed only once, just to disallow arbitrary + * transitions malicious software might bring in. This means one + * could make a snapshot over all processes running and monitor + * /proc/pid/exe changes to notice unusual activity if needed. + */ + err = -EPERM; + if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) + goto exit_unlock; + + err = 0; + set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ +exit_unlock: + up_write(&mm->mmap_sem); + +exit: + fdput(exe); + return err; +} + +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + unsigned long rlim = rlimit(RLIMIT_DATA); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int error; + + if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) + return -EINVAL; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (opt == PR_SET_MM_EXE_FILE) + return prctl_set_mm_exe_file(mm, (unsigned int)addr); + + if (addr >= TASK_SIZE || addr < mmap_min_addr) + return -EINVAL; + + error = -EINVAL; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + + switch (opt) { + case PR_SET_MM_START_CODE: + mm->start_code = addr; + break; + case PR_SET_MM_END_CODE: + mm->end_code = addr; + break; + case PR_SET_MM_START_DATA: + mm->start_data = addr; + break; + case PR_SET_MM_END_DATA: + mm->end_data = addr; + break; + + case PR_SET_MM_START_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (mm->brk - addr) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->start_brk = addr; + break; + + case PR_SET_MM_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (addr - mm->start_brk) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->brk = addr; + break; + + /* + * If command line arguments and environment + * are placed somewhere else on stack, we can + * set them up here, ARG_START/END to setup + * command line argumets and ENV_START/END + * for environment. + */ + case PR_SET_MM_START_STACK: + case PR_SET_MM_ARG_START: + case PR_SET_MM_ARG_END: + case PR_SET_MM_ENV_START: + case PR_SET_MM_ENV_END: + if (!vma) { + error = -EFAULT; + goto out; + } + if (opt == PR_SET_MM_START_STACK) + mm->start_stack = addr; + else if (opt == PR_SET_MM_ARG_START) + mm->arg_start = addr; + else if (opt == PR_SET_MM_ARG_END) + mm->arg_end = addr; + else if (opt == PR_SET_MM_ENV_START) + mm->env_start = addr; + else if (opt == PR_SET_MM_ENV_END) + mm->env_end = addr; + break; + + /* + * This doesn't move auxiliary vector itself + * since it's pinned to mm_struct, but allow + * to fill vector with new values. It's up + * to a caller to provide sane values here + * otherwise user space tools which use this + * vector might be unhappy. + */ + case PR_SET_MM_AUXV: { + unsigned long user_auxv[AT_VECTOR_SIZE]; + + if (arg4 > sizeof(user_auxv)) + goto out; + up_read(&mm->mmap_sem); + + if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) + return -EFAULT; + + /* Make sure the last entry is always AT_NULL */ + user_auxv[AT_VECTOR_SIZE - 2] = 0; + user_auxv[AT_VECTOR_SIZE - 1] = 0; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + + task_lock(current); + memcpy(mm->saved_auxv, user_auxv, arg4); + task_unlock(current); + + return 0; + } + default: + goto out; + } + + error = 0; +out: + up_read(&mm->mmap_sem); + return error; +} + +#ifdef CONFIG_CHECKPOINT_RESTORE +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) { + return put_user(me->clear_child_tid, tid_addr); +} +#else +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return -EINVAL; +} +#endif + +SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, + unsigned long, arg4, unsigned long, arg5) +{ + struct task_struct *me = current; + unsigned char comm[sizeof(me->comm)]; long error; error = security_task_prctl(option, arg2, arg3, arg4, arg5); - if (error) + if (error != -ENOSYS) return error; + error = 0; switch (option) { - case PR_SET_PDEATHSIG: - if (!valid_signal(arg2)) { - error = -EINVAL; - break; - } - current->pdeath_signal = arg2; - break; - case PR_GET_PDEATHSIG: - error = put_user(current->pdeath_signal, (int __user *)arg2); - break; - case PR_GET_DUMPABLE: - error = get_dumpable(current->mm); + case PR_SET_PDEATHSIG: + if (!valid_signal(arg2)) { + error = -EINVAL; break; - case PR_SET_DUMPABLE: - if (arg2 < 0 || arg2 > 1) { - error = -EINVAL; - break; - } - set_dumpable(current->mm, arg2); + } + me->pdeath_signal = arg2; + break; + case PR_GET_PDEATHSIG: + error = put_user(me->pdeath_signal, (int __user *)arg2); + break; + case PR_GET_DUMPABLE: + error = get_dumpable(me->mm); + break; + case PR_SET_DUMPABLE: + if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { + error = -EINVAL; break; + } + set_dumpable(me->mm, arg2); + break; - case PR_SET_UNALIGN: - error = SET_UNALIGN_CTL(current, arg2); - break; - case PR_GET_UNALIGN: - error = GET_UNALIGN_CTL(current, arg2); - break; - case PR_SET_FPEMU: - error = SET_FPEMU_CTL(current, arg2); - break; - case PR_GET_FPEMU: - error = GET_FPEMU_CTL(current, arg2); - break; - case PR_SET_FPEXC: - error = SET_FPEXC_CTL(current, arg2); - break; - case PR_GET_FPEXC: - error = GET_FPEXC_CTL(current, arg2); - break; - case PR_GET_TIMING: - error = PR_TIMING_STATISTICAL; + case PR_SET_UNALIGN: + error = SET_UNALIGN_CTL(me, arg2); + break; + case PR_GET_UNALIGN: + error = GET_UNALIGN_CTL(me, arg2); + break; + case PR_SET_FPEMU: + error = SET_FPEMU_CTL(me, arg2); + break; + case PR_GET_FPEMU: + error = GET_FPEMU_CTL(me, arg2); + break; + case PR_SET_FPEXC: + error = SET_FPEXC_CTL(me, arg2); + break; + case PR_GET_FPEXC: + error = GET_FPEXC_CTL(me, arg2); + break; + case PR_GET_TIMING: + error = PR_TIMING_STATISTICAL; + break; + case PR_SET_TIMING: + if (arg2 != PR_TIMING_STATISTICAL) + error = -EINVAL; + break; + case PR_SET_NAME: + comm[sizeof(me->comm) - 1] = 0; + if (strncpy_from_user(comm, (char __user *)arg2, + sizeof(me->comm) - 1) < 0) + return -EFAULT; + set_task_comm(me, comm); + proc_comm_connector(me); + break; + case PR_GET_NAME: + get_task_comm(comm, me); + if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) + return -EFAULT; + break; + case PR_GET_ENDIAN: + error = GET_ENDIAN(me, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(me, arg2); + break; + case PR_GET_SECCOMP: + error = prctl_get_seccomp(); + break; + case PR_SET_SECCOMP: + error = prctl_set_seccomp(arg2, (char __user *)arg3); + break; + case PR_GET_TSC: + error = GET_TSC_CTL(arg2); + break; + case PR_SET_TSC: + error = SET_TSC_CTL(arg2); + break; + case PR_TASK_PERF_EVENTS_DISABLE: + error = perf_event_task_disable(); + break; + case PR_TASK_PERF_EVENTS_ENABLE: + error = perf_event_task_enable(); + break; + case PR_GET_TIMERSLACK: + error = current->timer_slack_ns; + break; + case PR_SET_TIMERSLACK: + if (arg2 <= 0) + current->timer_slack_ns = + current->default_timer_slack_ns; + else + current->timer_slack_ns = arg2; + break; + case PR_MCE_KILL: + if (arg4 | arg5) + return -EINVAL; + switch (arg2) { + case PR_MCE_KILL_CLEAR: + if (arg3 != 0) + return -EINVAL; + current->flags &= ~PF_MCE_PROCESS; break; - case PR_SET_TIMING: - if (arg2 == PR_TIMING_STATISTICAL) - error = 0; + case PR_MCE_KILL_SET: + current->flags |= PF_MCE_PROCESS; + if (arg3 == PR_MCE_KILL_EARLY) + current->flags |= PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_LATE) + current->flags &= ~PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_DEFAULT) + current->flags &= + ~(PF_MCE_EARLY|PF_MCE_PROCESS); else - error = -EINVAL; - break; - - case PR_GET_KEEPCAPS: - if (current->keep_capabilities) - error = 1; - break; - case PR_SET_KEEPCAPS: - if (arg2 != 0 && arg2 != 1) { - error = -EINVAL; - break; - } - current->keep_capabilities = arg2; + return -EINVAL; break; - case PR_SET_NAME: { - struct task_struct *me = current; - unsigned char ncomm[sizeof(me->comm)]; - - ncomm[sizeof(me->comm)-1] = 0; - if (strncpy_from_user(ncomm, (char __user *)arg2, - sizeof(me->comm)-1) < 0) - return -EFAULT; - set_task_comm(me, ncomm); - return 0; - } - case PR_GET_NAME: { - struct task_struct *me = current; - unsigned char tcomm[sizeof(me->comm)]; - - get_task_comm(tcomm, me); - if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm))) - return -EFAULT; - return 0; + default: + return -EINVAL; } - case PR_GET_ENDIAN: - error = GET_ENDIAN(current, arg2); - break; - case PR_SET_ENDIAN: - error = SET_ENDIAN(current, arg2); - break; - - case PR_GET_SECCOMP: - error = prctl_get_seccomp(); - break; - case PR_SET_SECCOMP: - error = prctl_set_seccomp(arg2); - break; - - case PR_CAPBSET_READ: - if (!cap_valid(arg2)) - return -EINVAL; - return !!cap_raised(current->cap_bset, arg2); - case PR_CAPBSET_DROP: -#ifdef CONFIG_SECURITY_FILE_CAPABILITIES - return cap_prctl_drop(arg2); -#else + break; + case PR_MCE_KILL_GET: + if (arg2 | arg3 | arg4 | arg5) + return -EINVAL; + if (current->flags & PF_MCE_PROCESS) + error = (current->flags & PF_MCE_EARLY) ? + PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + else + error = PR_MCE_KILL_DEFAULT; + break; + case PR_SET_MM: + error = prctl_set_mm(arg2, arg3, arg4, arg5); + break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; + case PR_SET_CHILD_SUBREAPER: + me->signal->is_child_subreaper = !!arg2; + break; + case PR_GET_CHILD_SUBREAPER: + error = put_user(me->signal->is_child_subreaper, + (int __user *)arg2); + break; + case PR_SET_NO_NEW_PRIVS: + if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; -#endif - default: - error = -EINVAL; - break; + current->no_new_privs = 1; + break; + case PR_GET_NO_NEW_PRIVS: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + return current->no_new_privs ? 1 : 0; + case PR_GET_THP_DISABLE: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + break; + case PR_SET_THP_DISABLE: + if (arg3 || arg4 || arg5) + return -EINVAL; + down_write(&me->mm->mmap_sem); + if (arg2) + me->mm->def_flags |= VM_NOHUGEPAGE; + else + me->mm->def_flags &= ~VM_NOHUGEPAGE; + up_write(&me->mm->mmap_sem); + break; + default: + error = -EINVAL; + break; } return error; } -asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, - struct getcpu_cache __user *unused) +SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) { int err = 0; int cpu = raw_smp_processor_id(); @@ -1757,60 +2030,146 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, return err ? -EFAULT : 0; } -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; - -static void argv_cleanup(char **argv, char **envp) -{ - argv_free(argv); -} - /** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. + * do_sysinfo - fill in sysinfo struct + * @info: pointer to buffer to fill */ -int orderly_poweroff(bool force) +static int do_sysinfo(struct sysinfo *info) { - int argc; - char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); - static char *envp[] = { - "HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL - }; - int ret = -ENOMEM; - struct subprocess_info *info; - - if (argv == NULL) { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - goto out; - } + unsigned long mem_total, sav_total; + unsigned int mem_unit, bitcount; + struct timespec tp; + + memset(info, 0, sizeof(struct sysinfo)); - info = call_usermodehelper_setup(argv[0], argv, envp); - if (info == NULL) { - argv_free(argv); + get_monotonic_boottime(&tp); + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + + get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); + + info->procs = nr_threads; + + si_meminfo(info); + si_swapinfo(info); + + /* + * If the sum of all the available memory (i.e. ram + swap) + * is less than can be stored in a 32 bit unsigned long then + * we can be binary compatible with 2.2.x kernels. If not, + * well, in that case 2.2.x was broken anyways... + * + * -Erik Andersen <andersee@debian.org> + */ + + mem_total = info->totalram + info->totalswap; + if (mem_total < info->totalram || mem_total < info->totalswap) goto out; + bitcount = 0; + mem_unit = info->mem_unit; + while (mem_unit > 1) { + bitcount++; + mem_unit >>= 1; + sav_total = mem_total; + mem_total <<= 1; + if (mem_total < sav_total) + goto out; } - call_usermodehelper_setcleanup(info, argv_cleanup); + /* + * If mem_total did not overflow, multiply all memory values by + * info->mem_unit and set it to 1. This leaves things compatible + * with 2.2.x, and also retains compatibility with earlier 2.4.x + * kernels... + */ - ret = call_usermodehelper_exec(info, UMH_NO_WAIT); + info->mem_unit = 1; + info->totalram <<= bitcount; + info->freeram <<= bitcount; + info->sharedram <<= bitcount; + info->bufferram <<= bitcount; + info->totalswap <<= bitcount; + info->freeswap <<= bitcount; + info->totalhigh <<= bitcount; + info->freehigh <<= bitcount; - out: - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); +out: + return 0; +} + +SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) +{ + struct sysinfo val; + + do_sysinfo(&val); - /* I guess this should try to kick off some daemon to - sync and poweroff asap. Or not even bother syncing - if we're doing an emergency shutdown? */ - emergency_sync(); - kernel_power_off(); + if (copy_to_user(info, &val, sizeof(struct sysinfo))) + return -EFAULT; + + return 0; +} + +#ifdef CONFIG_COMPAT +struct compat_sysinfo { + s32 uptime; + u32 loads[3]; + u32 totalram; + u32 freeram; + u32 sharedram; + u32 bufferram; + u32 totalswap; + u32 freeswap; + u16 procs; + u16 pad; + u32 totalhigh; + u32 freehigh; + u32 mem_unit; + char _f[20-2*sizeof(u32)-sizeof(int)]; +}; + +COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) +{ + struct sysinfo s; + + do_sysinfo(&s); + + /* Check to see if any memory value is too large for 32-bit and scale + * down if needed + */ + if ((s.totalram >> 32) || (s.totalswap >> 32)) { + int bitcount = 0; + + while (s.mem_unit < PAGE_SIZE) { + s.mem_unit <<= 1; + bitcount++; + } + + s.totalram >>= bitcount; + s.freeram >>= bitcount; + s.sharedram >>= bitcount; + s.bufferram >>= bitcount; + s.totalswap >>= bitcount; + s.freeswap >>= bitcount; + s.totalhigh >>= bitcount; + s.freehigh >>= bitcount; } - return ret; + if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || + __put_user(s.uptime, &info->uptime) || + __put_user(s.loads[0], &info->loads[0]) || + __put_user(s.loads[1], &info->loads[1]) || + __put_user(s.loads[2], &info->loads[2]) || + __put_user(s.totalram, &info->totalram) || + __put_user(s.freeram, &info->freeram) || + __put_user(s.sharedram, &info->sharedram) || + __put_user(s.bufferram, &info->bufferram) || + __put_user(s.totalswap, &info->totalswap) || + __put_user(s.freeswap, &info->freeswap) || + __put_user(s.procs, &info->procs) || + __put_user(s.totalhigh, &info->totalhigh) || + __put_user(s.freehigh, &info->freehigh) || + __put_user(s.mem_unit, &info->mem_unit)) + return -EFAULT; + + return 0; } -EXPORT_SYMBOL_GPL(orderly_poweroff); +#endif /* CONFIG_COMPAT */ |
