aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c2
-rw-r--r--kernel/events/core.c2
-rw-r--r--kernel/exit.c12
-rw-r--r--kernel/fork.c69
-rw-r--r--kernel/nsproxy.c36
-rw-r--r--kernel/pid.c47
-rw-r--r--kernel/pid_namespace.c112
-rw-r--r--kernel/ptrace.c10
-rw-r--r--kernel/sched/core.c10
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/sysctl_binary.c2
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/user_namespace.c147
-rw-r--r--kernel/utsname.c33
14 files changed, 371 insertions, 115 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 571264830a2..4855892798f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3408,7 +3408,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = current->nsproxy->pid_ns;
+ struct pid_namespace *ns = task_active_pid_ns(current);
/*
* We can't drop the pidlist_mutex before taking the l->mutex in case
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f9ff5493171..301079d06f2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->parent = parent_event;
- event->ns = get_pid_ns(current->nsproxy->pid_ns);
+ event->ns = get_pid_ns(task_active_pid_ns(current));
event->id = atomic64_inc_return(&perf_event_id);
event->state = PERF_EVENT_STATE_INACTIVE;
diff --git a/kernel/exit.c b/kernel/exit.c
index 50d2e93c36e..b4df2193721 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
__this_cpu_dec(process_counts);
- /*
- * If we are the last child process in a pid namespace to be
- * reaped, notify the reaper sleeping zap_pid_ns_processes().
- */
- if (IS_ENABLED(CONFIG_PID_NS)) {
- struct task_struct *parent = p->real_parent;
-
- if ((task_active_pid_ns(parent)->child_reaper == parent) &&
- list_empty(&parent->children) &&
- (parent->flags & PF_EXITING))
- wake_up_process(parent);
- }
}
list_del_rcu(&p->thread_group);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 115d6c2e4cc..c36c4e301ef 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1044,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
atomic_set(&sig->live, 1);
atomic_set(&sig->sigcnt, 1);
init_waitqueue_head(&sig->wait_chldexit);
- if (clone_flags & CLONE_NEWPID)
- sig->flags |= SIGNAL_UNKILLABLE;
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
@@ -1438,8 +1436,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
if (thread_group_leader(p)) {
- if (is_child_reaper(pid))
- p->nsproxy->pid_ns->child_reaper = p;
+ if (is_child_reaper(pid)) {
+ ns_of_pid(pid)->child_reaper = p;
+ p->signal->flags |= SIGNAL_UNKILLABLE;
+ }
p->signal->leader_pid = pid;
p->signal->tty = tty_kref_get(current->signal->tty);
@@ -1473,8 +1473,6 @@ bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
- if (unlikely(clone_flags & CLONE_NEWPID))
- pid_ns_release_proc(p->nsproxy->pid_ns);
exit_task_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm)
@@ -1554,15 +1552,9 @@ long do_fork(unsigned long clone_flags,
* Do some preliminary argument and permissions checking before we
* actually start allocating stuff
*/
- if (clone_flags & CLONE_NEWUSER) {
- if (clone_flags & CLONE_THREAD)
+ if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
+ if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
return -EINVAL;
- /* hopefully this check will go away when userns support is
- * complete
- */
- if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
- !capable(CAP_SETGID))
- return -EPERM;
}
/*
@@ -1724,7 +1716,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
{
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
+ CLONE_NEWUSER|CLONE_NEWPID))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing to
@@ -1791,19 +1784,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
struct fs_struct *fs, *new_fs = NULL;
struct files_struct *fd, *new_fd = NULL;
+ struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
int err;
- err = check_unshare_flags(unshare_flags);
- if (err)
- goto bad_unshare_out;
-
+ /*
+ * If unsharing a user namespace must also unshare the thread.
+ */
+ if (unshare_flags & CLONE_NEWUSER)
+ unshare_flags |= CLONE_THREAD;
+ /*
+ * If unsharing a pid namespace must also unshare the thread.
+ */
+ if (unshare_flags & CLONE_NEWPID)
+ unshare_flags |= CLONE_THREAD;
+ /*
+ * If unsharing a thread from a thread group, must also unshare vm.
+ */
+ if (unshare_flags & CLONE_THREAD)
+ unshare_flags |= CLONE_VM;
+ /*
+ * If unsharing vm, must also unshare signal handlers.
+ */
+ if (unshare_flags & CLONE_VM)
+ unshare_flags |= CLONE_SIGHAND;
/*
* If unsharing namespace, must also unshare filesystem information.
*/
if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS;
+
+ err = check_unshare_flags(unshare_flags);
+ if (err)
+ goto bad_unshare_out;
/*
* CLONE_NEWIPC must also detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
@@ -1817,11 +1831,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
err = unshare_fd(unshare_flags, &new_fd);
if (err)
goto bad_unshare_cleanup_fs;
- err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
+ err = unshare_userns(unshare_flags, &new_cred);
if (err)
goto bad_unshare_cleanup_fd;
+ err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+ new_cred, new_fs);
+ if (err)
+ goto bad_unshare_cleanup_cred;
- if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
+ if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
if (do_sysvsem) {
/*
* CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1854,11 +1872,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
}
task_unlock(current);
+
+ if (new_cred) {
+ /* Install the new user namespace */
+ commit_creds(new_cred);
+ new_cred = NULL;
+ }
}
if (new_nsproxy)
put_nsproxy(new_nsproxy);
+bad_unshare_cleanup_cred:
+ if (new_cred)
+ put_cred(new_cred);
bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 7e1c3de1ce4..78e2ecb2016 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void)
* leave it to the caller to do proper locking and attach it to task.
*/
static struct nsproxy *create_new_namespaces(unsigned long flags,
- struct task_struct *tsk, struct fs_struct *new_fs)
+ struct task_struct *tsk, struct user_namespace *user_ns,
+ struct fs_struct *new_fs)
{
struct nsproxy *new_nsp;
int err;
@@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
if (!new_nsp)
return ERR_PTR(-ENOMEM);
- new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
+ new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
if (IS_ERR(new_nsp->mnt_ns)) {
err = PTR_ERR(new_nsp->mnt_ns);
goto out_ns;
}
- new_nsp->uts_ns = copy_utsname(flags, tsk);
+ new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
if (IS_ERR(new_nsp->uts_ns)) {
err = PTR_ERR(new_nsp->uts_ns);
goto out_uts;
}
- new_nsp->ipc_ns = copy_ipcs(flags, tsk);
+ new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
if (IS_ERR(new_nsp->ipc_ns)) {
err = PTR_ERR(new_nsp->ipc_ns);
goto out_ipc;
}
- new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
+ new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns);
if (IS_ERR(new_nsp->pid_ns)) {
err = PTR_ERR(new_nsp->pid_ns);
goto out_pid;
}
- new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns);
+ new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
goto out_net;
@@ -122,6 +123,7 @@ out_ns:
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
struct nsproxy *old_ns = tsk->nsproxy;
+ struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;
int err = 0;
@@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
CLONE_NEWPID | CLONE_NEWNET)))
return 0;
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
err = -EPERM;
goto out;
}
@@ -151,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
goto out;
}
- new_ns = create_new_namespaces(flags, tsk, tsk->fs);
+ new_ns = create_new_namespaces(flags, tsk,
+ task_cred_xxx(tsk, user_ns), tsk->fs);
if (IS_ERR(new_ns)) {
err = PTR_ERR(new_ns);
goto out;
@@ -183,19 +186,21 @@ void free_nsproxy(struct nsproxy *ns)
* On success, returns the new nsproxy.
*/
int unshare_nsproxy_namespaces(unsigned long unshare_flags,
- struct nsproxy **new_nsp, struct fs_struct *new_fs)
+ struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
{
+ struct user_namespace *user_ns;
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET)))
+ CLONE_NEWNET | CLONE_NEWPID)))
return 0;
- if (!capable(CAP_SYS_ADMIN))
+ user_ns = new_cred ? new_cred->user_ns : current_user_ns();
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
- *new_nsp = create_new_namespaces(unshare_flags, current,
- new_fs ? new_fs : current->fs);
+ *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
+ new_fs ? new_fs : current->fs);
if (IS_ERR(*new_nsp)) {
err = PTR_ERR(*new_nsp);
goto out;
@@ -241,9 +246,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
struct file *file;
int err;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
file = proc_ns_fget(fd);
if (IS_ERR(file))
return PTR_ERR(file);
@@ -254,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
if (nstype && (ops->type != nstype))
goto out;
- new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
+ new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
if (IS_ERR(new_nsproxy)) {
err = PTR_ERR(new_nsproxy);
goto out;
diff --git a/kernel/pid.c b/kernel/pid.c
index a54a1123c7c..36aa02ff17d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
+#include <linux/proc_fs.h>
#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -78,6 +79,8 @@ struct pid_namespace init_pid_ns = {
.last_pid = 0,
.level = 0,
.child_reaper = &init_task,
+ .user_ns = &init_user_ns,
+ .proc_inum = PROC_PID_INIT_INO,
};
EXPORT_SYMBOL_GPL(init_pid_ns);
@@ -254,8 +257,24 @@ void free_pid(struct pid *pid)
unsigned long flags;
spin_lock_irqsave(&pidmap_lock, flags);
- for (i = 0; i <= pid->level; i++)
- hlist_del_rcu(&pid->numbers[i].pid_chain);
+ for (i = 0; i <= pid->level; i++) {
+ struct upid *upid = pid->numbers + i;
+ struct pid_namespace *ns = upid->ns;
+ hlist_del_rcu(&upid->pid_chain);
+ switch(--ns->nr_hashed) {
+ case 1:
+ /* When all that is left in the pid namespace
+ * is the reaper wake up the reaper. The reaper
+ * may be sleeping in zap_pid_ns_processes().
+ */
+ wake_up_process(ns->child_reaper);
+ break;
+ case 0:
+ ns->nr_hashed = -1;
+ schedule_work(&ns->proc_work);
+ break;
+ }
+ }
spin_unlock_irqrestore(&pidmap_lock, flags);
for (i = 0; i <= pid->level; i++)
@@ -277,6 +296,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
goto out;
tmp = ns;
+ pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
nr = alloc_pidmap(tmp);
if (nr < 0)
@@ -287,22 +307,32 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = tmp->parent;
}
+ if (unlikely(is_child_reaper(pid))) {
+ if (pid_ns_prepare_proc(ns))
+ goto out_free;
+ }
+
get_pid_ns(ns);
- pid->level = ns->level;
atomic_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
- for ( ; upid >= pid->numbers; --upid)
+ if (ns->nr_hashed < 0)
+ goto out_unlock;
+ for ( ; upid >= pid->numbers; --upid) {
hlist_add_head_rcu(&upid->pid_chain,
&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+ upid->ns->nr_hashed++;
+ }
spin_unlock_irq(&pidmap_lock);
out:
return pid;
+out_unlock:
+ spin_unlock(&pidmap_lock);
out_free:
while (++i <= ns->level)
free_pidmap(pid->numbers + i);
@@ -329,7 +359,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
struct pid *find_vpid(int nr)
{
- return find_pid_ns(nr, current->nsproxy->pid_ns);
+ return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);
@@ -413,7 +443,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
struct task_struct *find_task_by_vpid(pid_t vnr)
{
- return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
+ return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -468,7 +498,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns);
pid_t pid_vnr(struct pid *pid)
{
- return pid_nr_ns(pid, current->nsproxy->pid_ns);
+ return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);
@@ -479,7 +509,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
rcu_read_lock();
if (!ns)
- ns = current->nsproxy->pid_ns;
+ ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
if (type != PIDTYPE_PID)
task = task->group_leader;
@@ -554,6 +584,7 @@ void __init pidmap_init(void)
/* Reserve PID 0. We never call free_pidmap(0) */
set_bit(0, init_pid_ns.pidmap[0].page);
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+ init_pid_ns.nr_hashed = 1;
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7b07cc0dfb7..560da0dab23 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -10,6 +10,7 @@
#include <linux/pid.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
#include <linux/syscalls.h>
#include <linux/err.h>
#include <linux/acct.h>
@@ -71,10 +72,17 @@ err_alloc:
return NULL;
}
+static void proc_cleanup_work(struct work_struct *work)
+{
+ struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
+ pid_ns_release_proc(ns);
+}
+
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32
-static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
+static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
+ struct pid_namespace *parent_pid_ns)
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
@@ -99,9 +107,15 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
if (ns->pid_cachep == NULL)
goto out_free_map;
+ err = proc_alloc_inum(&ns->proc_inum);
+ if (err)
+ goto out_free_map;
+
kref_init(&ns->kref);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
+ ns->user_ns = get_user_ns(user_ns);
+ INIT_WORK(&ns->proc_work, proc_cleanup_work);
set_bit(0, ns->pidmap[0].page);
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -109,14 +123,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
- err = pid_ns_prepare_proc(ns);
- if (err)
- goto out_put_parent_pid_ns;
-
return ns;
-out_put_parent_pid_ns:
- put_pid_ns(parent_pid_ns);
out_free_map:
kfree(ns->pidmap[0].page);
out_free:
@@ -129,18 +137,21 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
{
int i;
+ proc_free_inum(ns->proc_inum);
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
+ put_user_ns(ns->user_ns);
kmem_cache_free(pid_ns_cachep, ns);
}
-struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
+struct pid_namespace *copy_pid_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
if (!(flags & CLONE_NEWPID))
return get_pid_ns(old_ns);
- if (flags & (CLONE_THREAD|CLONE_PARENT))
+ if (task_active_pid_ns(current) != old_ns)
return ERR_PTR(-EINVAL);
- return create_pid_namespace(old_ns);
+ return create_pid_namespace(user_ns, old_ns);
}
static void free_pid_ns(struct kref *kref)
@@ -211,22 +222,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
/*
* sys_wait4() above can't reap the TASK_DEAD children.
- * Make sure they all go away, see __unhash_process().
+ * Make sure they all go away, see free_pid().
*/
for (;;) {
- bool need_wait = false;
-
- read_lock(&tasklist_lock);
- if (!list_empty(&current->children)) {
- __set_current_state(TASK_UNINTERRUPTIBLE);
- need_wait = true;
- }
- read_unlock(&tasklist_lock);
-
- if (!need_wait)
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (pid_ns->nr_hashed == 1)
break;
schedule();
}
+ __set_current_state(TASK_RUNNING);
if (pid_ns->reboot)
current->signal->group_exit_code = pid_ns->reboot;
@@ -239,9 +243,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
- if (write && !capable(CAP_SYS_ADMIN))
+ if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
@@ -250,7 +255,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
* it should synchronize its usage with external means.
*/
- tmp.data = &current->nsproxy->pid_ns->last_pid;
+ tmp.data = &pid_ns->last_pid;
return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
}
@@ -299,6 +304,67 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return 0;
}
+static void *pidns_get(struct task_struct *task)
+{
+ struct pid_namespace *ns;
+
+ rcu_read_lock();
+ ns = get_pid_ns(task_active_pid_ns(task));
+ rcu_read_unlock();
+
+ return ns;
+}
+
+static void pidns_put(void *ns)
+{
+ put_pid_ns(ns);
+}
+
+static int pidns_install(struct nsproxy *nsproxy, void *ns)
+{
+ struct pid_namespace *active = task_active_pid_ns(current);
+ struct pid_namespace *ancestor, *new = ns;
+
+ if (!ns_capable(new->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * Only allow entering the current active pid namespace
+ * or a child of the current active pid namespace.
+ *
+ * This is required for fork to return a usable pid value and
+ * this maintains the property that processes and their
+ * children can not escape their current pid namespace.
+ */
+ if (new->level < active->level)
+ return -EINVAL;
+
+ ancestor = new;
+ while (ancestor->level > active->level)
+ ancestor = ancestor->parent;
+ if (ancestor != active)
+ return -EINVAL;
+
+ put_pid_ns(nsproxy->pid_ns);
+ nsproxy->pid_ns = get_pid_ns(new);
+ return 0;
+}
+
+static unsigned int pidns_inum(void *ns)
+{
+ struct pid_namespace *pid_ns = ns;
+ return pid_ns->proc_inum;
+}
+
+const struct proc_ns_operations pidns_operations = {
+ .name = "pid",
+ .type = CLONE_NEWPID,
+ .get = pidns_get,
+ .put = pidns_put,
+ .install = pidns_install,
+ .inum = pidns_inum,
+};
+
static __init int pid_namespaces_init(void)
{
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ec8118ab2a4..1599157336a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -215,8 +215,12 @@ ok:
smp_rmb();
if (task->mm)
dumpable = get_dumpable(task->mm);
- if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode))
+ rcu_read_lock();
+ if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+ rcu_read_unlock();
return -EPERM;
+ }
+ rcu_read_unlock();
return security_ptrace_access_check(task, mode);
}
@@ -280,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request,
if (seize)
flags |= PT_SEIZED;
- if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
+ rcu_read_lock();
+ if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
flags |= PT_PTRACE_CAP;
+ rcu_read_unlock();
task->ptrace = flags;
__ptrace_link(task, current);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1fb82104bf..257002c13bb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4097,8 +4097,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
- goto out_unlock;
+ if (!check_same_owner(p)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ rcu_read_unlock();
+ }
retval = security_task_setscheduler(p);
if (retval)
diff --git a/kernel/signal.c b/kernel/signal.c
index a49c7f36ceb..580a91e6347 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1753,7 +1753,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
* see comment in do_notify_parent() about the following 4 lines
*/
rcu_read_lock();
- info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
+ info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
rcu_read_unlock();
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 65bdcf198d4..5a638445050 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
goto out_putname;
}
- mnt = current->nsproxy->pid_ns->proc_mnt;
+ mnt = task_active_pid_ns(current)->proc_mnt;
file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
result = PTR_ERR(file);
if (IS_ERR(file))
diff --git a/kernel/user.c b/kernel/user.c
index 750acffbe9e..33acb5e53a5 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
/*
* userns count is 1 for root user, 1 for init_uts_ns,
@@ -51,6 +52,7 @@ struct user_namespace init_user_ns = {
},
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
+ .proc_inum = PROC_USER_INIT_INO,
};
EXPORT_SYMBOL_GPL(init_user_ns);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 456a6b9fba3..f5975ccf934 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
@@ -26,6 +27,24 @@ static struct kmem_cache *user_ns_cachep __read_mostly;
static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
struct uid_gid_map *map);
+static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
+{
+ /* Start with the same capabilities as init but useless for doing
+ * anything as the capabilities are bound to the new user namespace.
+ */
+ cred->securebits = SECUREBITS_DEFAULT;
+ cred->cap_inheritable = CAP_EMPTY_SET;
+ cred->cap_permitted = CAP_FULL_SET;
+ cred->cap_effective = CAP_FULL_SET;
+ cred->cap_bset = CAP_FULL_SET;
+#ifdef CONFIG_KEYS
+ key_put(cred->request_key_auth);
+ cred->request_key_auth = NULL;
+#endif
+ /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
+ cred->user_ns = user_ns;
+}
+
/*
* Create a new user namespace, deriving the creator from the user in the
* passed credentials, and replacing that user with the new root user for the
@@ -39,6 +58,7 @@ int create_user_ns(struct cred *new)
struct user_namespace *ns, *parent_ns = new->user_ns;
kuid_t owner = new->euid;
kgid_t group = new->egid;
+ int ret;
/* The creator needs a mapping in the parent user namespace
* or else we won't be able to reasonably tell userspace who
@@ -52,38 +72,45 @@ int create_user_ns(struct cred *new)
if (!ns)
return -ENOMEM;
+ ret = proc_alloc_inum(&ns->proc_inum);
+ if (ret) {
+ kmem_cache_free(user_ns_cachep, ns);
+ return ret;
+ }
+
kref_init(&ns->kref);
+ /* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
ns->owner = owner;
ns->group = group;
- /* Start with the same capabilities as init but useless for doing
- * anything as the capabilities are bound to the new user namespace.
- */
- new->securebits = SECUREBITS_DEFAULT;
- new->cap_inheritable = CAP_EMPTY_SET;
- new->cap_permitted = CAP_FULL_SET;
- new->cap_effective = CAP_FULL_SET;
- new->cap_bset = CAP_FULL_SET;
-#ifdef CONFIG_KEYS
- key_put(new->request_key_auth);
- new->request_key_auth = NULL;
-#endif
- /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
-
- /* Leave the new->user_ns reference with the new user namespace. */
- /* Leave the reference to our user_ns with the new cred. */
- new->user_ns = ns;
+ set_cred_user_ns(new, ns);
return 0;
}
+int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
+{
+ struct cred *cred;
+
+ if (!(unshare_flags & CLONE_NEWUSER))
+ return 0;
+
+ cred = prepare_creds();
+ if (!cred)
+ return -ENOMEM;
+
+ *new_cred = cred;
+ return create_user_ns(cred);
+}
+
void free_user_ns(struct kref *kref)
{
struct user_namespace *parent, *ns =
container_of(kref, struct user_namespace, kref);
parent = ns->parent;
+ proc_free_inum(ns->proc_inum);
kmem_cache_free(user_ns_cachep, ns);
put_user_ns(parent);
}
@@ -372,7 +399,7 @@ static int uid_m_show(struct seq_file *seq, void *v)
struct user_namespace *lower_ns;
uid_t lower;
- lower_ns = current_user_ns();
+ lower_ns = seq_user_ns(seq);
if ((lower_ns == ns) && lower_ns->parent)
lower_ns = lower_ns->parent;
@@ -393,7 +420,7 @@ static int gid_m_show(struct seq_file *seq, void *v)
struct user_namespace *lower_ns;
gid_t lower;
- lower_ns = current_user_ns();
+ lower_ns = seq_user_ns(seq);
if ((lower_ns == ns) && lower_ns->parent)
lower_ns = lower_ns->parent;
@@ -669,10 +696,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
{
struct seq_file *seq = file->private_data;
struct user_namespace *ns = seq->private;
+ struct user_namespace *seq_ns = seq_user_ns(seq);
if (!ns->parent)
return -EPERM;
+ if ((seq_ns != ns) && (seq_ns != ns->parent))
+ return -EPERM;
+
return map_write(file, buf, size, ppos, CAP_SETUID,
&ns->uid_map, &ns->parent->uid_map);
}
@@ -681,10 +712,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
{
struct seq_file *seq = file->private_data;
struct user_namespace *ns = seq->private;
+ struct user_namespace *seq_ns = seq_user_ns(seq);
if (!ns->parent)
return -EPERM;
+ if ((seq_ns != ns) && (seq_ns != ns->parent))
+ return -EPERM;
+
return map_write(file, buf, size, ppos, CAP_SETGID,
&ns->gid_map, &ns->parent->gid_map);
}
@@ -709,6 +744,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
+ /* Allow mapping to your own filesystem ids */
+ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
+ u32 id = new_map->extent[0].lower_first;
+ if (cap_setid == CAP_SETUID) {
+ kuid_t uid = make_kuid(ns->parent, id);
+ if (uid_eq(uid, current_fsuid()))
+ return true;
+ }
+ else if (cap_setid == CAP_SETGID) {
+ kgid_t gid = make_kgid(ns->parent, id);
+ if (gid_eq(gid, current_fsgid()))
+ return true;
+ }
+ }
+
/* Allow anyone to set a mapping that doesn't require privilege */
if (!cap_valid(cap_setid))
return true;
@@ -722,6 +772,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
return false;
}
+static void *userns_get(struct task_struct *task)
+{
+ struct user_namespace *user_ns;
+
+ rcu_read_lock();
+ user_ns = get_user_ns(__task_cred(task)->user_ns);
+ rcu_read_unlock();
+
+ return user_ns;
+}
+
+static void userns_put(void *ns)
+{
+ put_user_ns(ns);
+}
+
+static int userns_install(struct nsproxy *nsproxy, void *ns)
+{
+ struct user_namespace *user_ns = ns;
+ struct cred *cred;
+
+ /* Don't allow gaining capabilities by reentering
+ * the same user namespace.
+ */
+ if (user_ns == current_user_ns())
+ return -EINVAL;
+
+ /* Threaded many not enter a different user namespace */
+ if (atomic_read(&current->mm->mm_users) > 1)
+ return -EINVAL;
+
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ cred = prepare_creds();
+ if (!cred)
+ return -ENOMEM;
+
+ put_user_ns(cred->user_ns);
+ set_cred_user_ns(cred, get_user_ns(user_ns));
+
+ return commit_creds(cred);
+}
+
+static unsigned int userns_inum(void *ns)
+{
+