diff options
Diffstat (limited to 'kernel/pid_namespace.c')
| -rw-r--r-- | kernel/pid_namespace.c | 239 | 
1 files changed, 218 insertions, 21 deletions
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a5aff94e1f0..db95d8eb761 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -10,12 +10,14 @@  #include <linux/pid.h>  #include <linux/pid_namespace.h> +#include <linux/user_namespace.h>  #include <linux/syscalls.h>  #include <linux/err.h>  #include <linux/acct.h>  #include <linux/slab.h> - -#define BITS_PER_PAGE		(PAGE_SIZE*8) +#include <linux/proc_ns.h> +#include <linux/reboot.h> +#include <linux/export.h>  struct pid_cache {  	int nr_ids; @@ -68,12 +70,29 @@ err_alloc:  	return NULL;  } -static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) +static void proc_cleanup_work(struct work_struct *work) +{ +	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); +	pid_ns_release_proc(ns); +} + +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 + +static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, +	struct pid_namespace *parent_pid_ns)  {  	struct pid_namespace *ns;  	unsigned int level = parent_pid_ns->level + 1;  	int i; +	int err; +	if (level > MAX_PID_NS_LEVEL) { +		err = -EINVAL; +		goto out; +	} + +	err = -ENOMEM;  	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);  	if (ns == NULL)  		goto out; @@ -86,9 +105,16 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p  	if (ns->pid_cachep == NULL)  		goto out_free_map; +	err = proc_alloc_inum(&ns->proc_inum); +	if (err) +		goto out_free_map; +  	kref_init(&ns->kref);  	ns->level = level;  	ns->parent = get_pid_ns(parent_pid_ns); +	ns->user_ns = get_user_ns(user_ns); +	ns->nr_hashed = PIDNS_HASH_ADDING; +	INIT_WORK(&ns->proc_work, proc_cleanup_work);  	set_bit(0, ns->pidmap[0].page);  	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -103,45 +129,71 @@ out_free_map:  out_free:  	kmem_cache_free(pid_ns_cachep, ns);  out: -	return ERR_PTR(-ENOMEM); +	return ERR_PTR(err); +} + +static void delayed_free_pidns(struct rcu_head *p) +{ +	kmem_cache_free(pid_ns_cachep, +			container_of(p, struct pid_namespace, rcu));  }  static void destroy_pid_namespace(struct pid_namespace *ns)  {  	int i; +	proc_free_inum(ns->proc_inum);  	for (i = 0; i < PIDMAP_ENTRIES; i++)  		kfree(ns->pidmap[i].page); -	kmem_cache_free(pid_ns_cachep, ns); +	put_user_ns(ns->user_ns); +	call_rcu(&ns->rcu, delayed_free_pidns);  } -struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) +struct pid_namespace *copy_pid_ns(unsigned long flags, +	struct user_namespace *user_ns, struct pid_namespace *old_ns)  {  	if (!(flags & CLONE_NEWPID))  		return get_pid_ns(old_ns); -	if (flags & (CLONE_THREAD|CLONE_PARENT)) +	if (task_active_pid_ns(current) != old_ns)  		return ERR_PTR(-EINVAL); -	return create_pid_namespace(old_ns); +	return create_pid_namespace(user_ns, old_ns);  } -void free_pid_ns(struct kref *kref) +static void free_pid_ns(struct kref *kref)  { -	struct pid_namespace *ns, *parent; +	struct pid_namespace *ns;  	ns = container_of(kref, struct pid_namespace, kref); - -	parent = ns->parent;  	destroy_pid_namespace(ns); +} + +void put_pid_ns(struct pid_namespace *ns) +{ +	struct pid_namespace *parent; -	if (parent != NULL) -		put_pid_ns(parent); +	while (ns != &init_pid_ns) { +		parent = ns->parent; +		if (!kref_put(&ns->kref, free_pid_ns)) +			break; +		ns = parent; +	}  } +EXPORT_SYMBOL_GPL(put_pid_ns);  void zap_pid_ns_processes(struct pid_namespace *pid_ns)  {  	int nr;  	int rc; -	struct task_struct *task; +	struct task_struct *task, *me = current; +	int init_pids = thread_group_leader(me) ? 1 : 2; + +	/* Don't allow any more processes into the pid namespace */ +	disable_pid_allocation(pid_ns); + +	/* Ignore SIGCHLD causing any terminated children to autoreap */ +	spin_lock_irq(&me->sighand->siglock); +	me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; +	spin_unlock_irq(&me->sighand->siglock);  	/*  	 * The last thread in the cgroup-init thread group is terminating. @@ -161,13 +213,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)  	while (nr > 0) {  		rcu_read_lock(); -		/* -		 * Any nested-container's init processes won't ignore the -		 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). -		 */  		task = pid_task(find_vpid(nr), PIDTYPE_PID); -		if (task) -			send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); +		if (task && !__fatal_signal_pending(task)) +			send_sig_info(SIGKILL, SEND_SIG_FORCED, task);  		rcu_read_unlock(); @@ -175,18 +223,167 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)  	}  	read_unlock(&tasklist_lock); +	/* Firstly reap the EXIT_ZOMBIE children we may have. */  	do {  		clear_thread_flag(TIF_SIGPENDING);  		rc = sys_wait4(-1, NULL, __WALL, NULL);  	} while (rc != -ECHILD); +	/* +	 * sys_wait4() above can't reap the TASK_DEAD children. +	 * Make sure they all go away, see free_pid(). +	 */ +	for (;;) { +		set_current_state(TASK_UNINTERRUPTIBLE); +		if (pid_ns->nr_hashed == init_pids) +			break; +		schedule(); +	} +	__set_current_state(TASK_RUNNING); + +	if (pid_ns->reboot) +		current->signal->group_exit_code = pid_ns->reboot; +  	acct_exit_ns(pid_ns);  	return;  } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int pid_ns_ctl_handler(struct ctl_table *table, int write, +		void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	struct pid_namespace *pid_ns = task_active_pid_ns(current); +	struct ctl_table tmp = *table; + +	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) +		return -EPERM; + +	/* +	 * Writing directly to ns' last_pid field is OK, since this field +	 * is volatile in a living namespace anyway and a code writing to +	 * it should synchronize its usage with external means. +	 */ + +	tmp.data = &pid_ns->last_pid; +	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); +} + +extern int pid_max; +static int zero = 0; +static struct ctl_table pid_ns_ctl_table[] = { +	{ +		.procname = "ns_last_pid", +		.maxlen = sizeof(int), +		.mode = 0666, /* permissions are checked in the handler */ +		.proc_handler = pid_ns_ctl_handler, +		.extra1 = &zero, +		.extra2 = &pid_max, +	}, +	{ } +}; +static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; +#endif	/* CONFIG_CHECKPOINT_RESTORE */ + +int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) +{ +	if (pid_ns == &init_pid_ns) +		return 0; + +	switch (cmd) { +	case LINUX_REBOOT_CMD_RESTART2: +	case LINUX_REBOOT_CMD_RESTART: +		pid_ns->reboot = SIGHUP; +		break; + +	case LINUX_REBOOT_CMD_POWER_OFF: +	case LINUX_REBOOT_CMD_HALT: +		pid_ns->reboot = SIGINT; +		break; +	default: +		return -EINVAL; +	} + +	read_lock(&tasklist_lock); +	force_sig(SIGKILL, pid_ns->child_reaper); +	read_unlock(&tasklist_lock); + +	do_exit(0); + +	/* Not reached */ +	return 0; +} + +static void *pidns_get(struct task_struct *task) +{ +	struct pid_namespace *ns; + +	rcu_read_lock(); +	ns = task_active_pid_ns(task); +	if (ns) +		get_pid_ns(ns); +	rcu_read_unlock(); + +	return ns; +} + +static void pidns_put(void *ns) +{ +	put_pid_ns(ns); +} + +static int pidns_install(struct nsproxy *nsproxy, void *ns) +{ +	struct pid_namespace *active = task_active_pid_ns(current); +	struct pid_namespace *ancestor, *new = ns; + +	if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || +	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) +		return -EPERM; + +	/* +	 * Only allow entering the current active pid namespace +	 * or a child of the current active pid namespace. +	 * +	 * This is required for fork to return a usable pid value and +	 * this maintains the property that processes and their +	 * children can not escape their current pid namespace. +	 */ +	if (new->level < active->level) +		return -EINVAL; + +	ancestor = new; +	while (ancestor->level > active->level) +		ancestor = ancestor->parent; +	if (ancestor != active) +		return -EINVAL; + +	put_pid_ns(nsproxy->pid_ns_for_children); +	nsproxy->pid_ns_for_children = get_pid_ns(new); +	return 0; +} + +static unsigned int pidns_inum(void *ns) +{ +	struct pid_namespace *pid_ns = ns; +	return pid_ns->proc_inum; +} + +const struct proc_ns_operations pidns_operations = { +	.name		= "pid", +	.type		= CLONE_NEWPID, +	.get		= pidns_get, +	.put		= pidns_put, +	.install	= pidns_install, +	.inum		= pidns_inum, +}; +  static __init int pid_namespaces_init(void)  {  	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + +#ifdef CONFIG_CHECKPOINT_RESTORE +	register_sysctl_paths(kern_path, pid_ns_ctl_table); +#endif  	return 0;  }  | 
