From 8382fcac1b813ad0a4e68a838fc7ae93fa39eda0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 20 Dec 2012 19:26:06 -0800 Subject: pidns: Outlaw thread creation after unshare(CLONE_NEWPID) The sequence: unshare(CLONE_NEWPID) clone(CLONE_THREAD|CLONE_SIGHAND|CLONE_VM) Creates a new process in the new pid namespace without setting pid_ns->child_reaper. After forking this results in a NULL pointer dereference. Avoid this and other nonsense scenarios that can show up after creating a new pid namespace with unshare by adding a new check in copy_prodcess. Pointed-out-by: Oleg Nesterov Acked-by: Oleg Nesterov Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a31b823b3c2..65ca6d27f24 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1166,6 +1166,14 @@ static struct task_struct *copy_process(unsigned long clone_flags, current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); + /* + * If the new process will be in a different pid namespace + * don't allow the creation of threads. + */ + if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && + (task_active_pid_ns(current) != current->nsproxy->pid_ns)) + return ERR_PTR(-EINVAL); + retval = security_task_create(clone_flags); if (retval) goto fork_out; -- cgit v1.2.3-70-g09d2 From c876ad7682155958d0c9c27afe9017925c230d64 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 21 Dec 2012 20:27:12 -0800 Subject: pidns: Stop pid allocation when init dies Oleg pointed out that in a pid namespace the sequence. - pid 1 becomes a zombie - setns(thepidns), fork,... - reaping pid 1. - The injected processes exiting. Can lead to processes attempting access their child reaper and instead following a stale pointer. That waitpid for init can return before all of the processes in the pid namespace have exited is also unfortunate. Avoid these problems by disabling the allocation of new pids in a pid namespace when init dies, instead of when the last process in a pid namespace is reaped. Pointed-out-by: Oleg Nesterov Reviewed-by: Oleg Nesterov Signed-off-by: "Eric W. Biederman" --- include/linux/pid.h | 1 + include/linux/pid_namespace.h | 4 +++- kernel/pid.c | 15 ++++++++++++--- kernel/pid_namespace.c | 4 ++++ 4 files changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/pid.h b/include/linux/pid.h index b152d44fb18..2381c973d89 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -121,6 +121,7 @@ int next_pidmap(struct pid_namespace *pid_ns, unsigned int last); extern struct pid *alloc_pid(struct pid_namespace *ns); extern void free_pid(struct pid *pid); +extern void disable_pid_allocation(struct pid_namespace *ns); /* * ns_of_pid() returns the pid namespace in which the specified pid was diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index bf285999273..215e5e3dda1 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -21,7 +21,7 @@ struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; int last_pid; - int nr_hashed; + unsigned int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; @@ -42,6 +42,8 @@ struct pid_namespace { extern struct pid_namespace init_pid_ns; +#define PIDNS_HASH_ADDING (1U << 31) + #ifdef CONFIG_PID_NS static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { diff --git a/kernel/pid.c b/kernel/pid.c index 36aa02ff17d..de9af600006 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -270,7 +270,6 @@ void free_pid(struct pid *pid) wake_up_process(ns->child_reaper); break; case 0: - ns->nr_hashed = -1; schedule_work(&ns->proc_work); break; } @@ -319,7 +318,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - if (ns->nr_hashed < 0) + if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) goto out_unlock; for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, @@ -342,6 +341,13 @@ out_free: goto out; } +void disable_pid_allocation(struct pid_namespace *ns) +{ + spin_lock_irq(&pidmap_lock); + ns->nr_hashed &= ~PIDNS_HASH_ADDING; + spin_unlock_irq(&pidmap_lock); +} + struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { struct hlist_node *elem; @@ -573,6 +579,9 @@ void __init pidhash_init(void) void __init pidmap_init(void) { + /* Veryify no one has done anything silly */ + BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); + /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, PIDS_PER_CPU_DEFAULT * num_possible_cpus())); @@ -584,7 +593,7 @@ void __init pidmap_init(void) /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); - init_pid_ns.nr_hashed = 1; + init_pid_ns.nr_hashed = PIDNS_HASH_ADDING; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index fdbd0cdf271..c1c3dc1c602 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -115,6 +115,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); + ns->nr_hashed = PIDNS_HASH_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); set_bit(0, ns->pidmap[0].page); @@ -181,6 +182,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int rc; struct task_struct *task, *me = current; + /* Don't allow any more processes into the pid namespace */ + disable_pid_allocation(pid_ns); + /* Ignore SIGCHLD causing any terminated children to autoreap */ spin_lock_irq(&me->sighand->siglock); me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; -- cgit v1.2.3-70-g09d2 From 35dac27cedd14c3b6fcd4ba7bc3c31738cfd1831 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 4 Jan 2013 15:35:50 -0800 Subject: printk: fix incorrect length from print_time() when seconds > 99999 print_prefix() passes a NULL buf to print_time() to get the length of the time prefix; when printk times are enabled, the current code just returns the constant 15, which matches the format "[%5lu.%06lu] " used to print the time value. However, this is obviously incorrect when the whole seconds part of the time gets beyond 5 digits (100000 seconds is a bit more than a day of uptime). The simple fix is to use snprintf(NULL, 0, ...) to calculate the actual length of the time prefix. This could be micro-optimized but it seems better to have simpler, more readable code here. The bug leads to the syslog system call miscomputing which messages fit into the userspace buffer. If there are enough messages to fill log_buf_len and some have a timestamp >= 100000, dmesg may fail with: # dmesg klogctl: Bad address When this happens, strace shows that the failure is indeed EFAULT due to the kernel mistakenly accessing past the end of dmesg's buffer, since dmesg asks the kernel how big a buffer it needs, allocates a bit more, and then gets an error when it asks the kernel to fill it: syslog(0xa, 0, 0) = 1048576 mmap(NULL, 1052672, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4d25d2000 syslog(0x3, 0x7fa4d25d2010, 0x100008) = -1 EFAULT (Bad address) As far as I can see, the bug has been there as long as print_time(), which comes from commit 084681d14e42 ("printk: flush continuation lines immediately to console") in 3.5-rc5. Signed-off-by: Roland Dreier Signed-off-by: Greg Kroah-Hartman Cc: Joe Perches Cc: Sylvain Munaut Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 19c0d7bcf24..357f714ddd4 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -870,10 +870,11 @@ static size_t print_time(u64 ts, char *buf) if (!printk_time) return 0; + rem_nsec = do_div(ts, 1000000000); + if (!buf) - return 15; + return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts); - rem_nsec = do_div(ts, 1000000000); return sprintf(buf, "[%5lu.%06lu] ", (unsigned long)ts, rem_nsec / 1000); } -- cgit v1.2.3-70-g09d2 From 5ba53ff648e785445a32ba39112ed07e4cf588d0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 5 Jan 2013 19:13:13 +0100 Subject: signals: sys_ssetmask() uses uninitialized newmask Commit 77097ae503b1 ("most of set_current_blocked() callers want SIGKILL/SIGSTOP removed from set") removed the initialization of newmask by accident, causing ltp to complain like this: ssetmask01 1 TFAIL : sgetmask() failed: TEST_ERRNO=???(0): Success Restore the proper initialization. Reported-and-tested-by: CAI Qian Signed-off-by: Oleg Nesterov Cc: stable@kernel.org # v3.5+ Signed-off-by: Linus Torvalds --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7aaa51d8e5b..9692499c77c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3286,6 +3286,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) int old = current->blocked.sig[0]; sigset_t newset; + siginitset(&newset, newmask); set_current_blocked(&newset); return old; -- cgit v1.2.3-70-g09d2 From 0c4a842349b27d361f1503f6437df303e1b541c9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 5 Jan 2013 19:13:29 +0100 Subject: signals: set_current_blocked() can use __set_current_blocked() Cleanup. And I think we need more cleanups, in particular __set_current_blocked() and sigprocmask() should die. Nobody should ever block SIGKILL or SIGSTOP. - Change set_current_blocked() to use __set_current_blocked() - Change sys_sigprocmask() to use set_current_blocked(), this way it should not worry about SIGKILL/SIGSTOP. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/signal.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9692499c77c..372771e948c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2528,11 +2528,8 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) */ void set_current_blocked(sigset_t *newset) { - struct task_struct *tsk = current; sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); - spin_lock_irq(&tsk->sighand->siglock); - __set_task_blocked(tsk, newset); - spin_unlock_irq(&tsk->sighand->siglock); + __set_current_blocked(newset); } void __set_current_blocked(const sigset_t *newset) @@ -3204,7 +3201,6 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, if (nset) { if (copy_from_user(&new_set, nset, sizeof(*nset))) return -EFAULT; - new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); new_blocked = current->blocked; @@ -3222,7 +3218,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, return -EINVAL; } - __set_current_blocked(&new_blocked); + set_current_blocked(&new_blocked); } if (oset) { -- cgit v1.2.3-70-g09d2