From a7dc19b8652c862d5b7c4d2339bd3c428bd29c4a Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 7 Mar 2013 15:09:24 +0000
Subject: clockevents: Don't allow dummy broadcast timers

Currently tick_check_broadcast_device doesn't reject clock_event_devices
with CLOCK_EVT_FEAT_DUMMY, and may select them in preference to real
hardware if they have a higher rating value. In this situation, the
dummy timer is responsible for broadcasting to itself, and the core
clockevents code may attempt to call non-existent callbacks for
programming the dummy, eventually leading to a panic.

This patch makes tick_check_broadcast_device always reject dummy timers,
preventing this problem.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Jon Medhurst (Tixy) <tixy@linaro.org>
Cc: stable@vger.kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 2fb8cb88df8..7f32fe0e52c 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -67,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
  */
 int tick_check_broadcast_device(struct clock_event_device *dev)
 {
-	if ((tick_broadcast_device.evtdev &&
+	if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
+	    (tick_broadcast_device.evtdev &&
 	     tick_broadcast_device.evtdev->rating >= dev->rating) ||
 	     (dev->features & CLOCK_EVT_FEAT_C3STOP))
 		return 0;
-- 
cgit v1.2.3-70-g09d2


From 751c644b95bb48aaa8825f0c66abbcc184d92051 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Mar 2013 02:27:11 -0700
Subject: pid: Handle the exit of a multi-threaded init.

When a multi-threaded init exits and the initial thread is not the
last thread to exit the initial thread hangs around as a zombie
until the last thread exits.  In that case zap_pid_ns_processes
needs to wait until there are only 2 hashed pids in the pid
namespace not one.

v2. Replace thread_pid_vnr(me) == 1 with the test thread_group_leader(me)
    as suggested by Oleg.

Cc: stable@vger.kernel.org
Cc: Oleg Nesterov <oleg@redhat.com>
Reported-by: Caj Larsson <caj@omnicloud.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/pid_namespace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index c1c3dc1c602..bea15bdf82b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -181,6 +181,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	int nr;
 	int rc;
 	struct task_struct *task, *me = current;
+	int init_pids = thread_group_leader(me) ? 1 : 2;
 
 	/* Don't allow any more processes into the pid namespace */
 	disable_pid_allocation(pid_ns);
@@ -230,7 +231,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 */
 	for (;;) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (pid_ns->nr_hashed == 1)
+		if (pid_ns->nr_hashed == init_pids)
 			break;
 		schedule();
 	}
-- 
cgit v1.2.3-70-g09d2


From 3151527ee007b73a0ebd296010f1c0454a919c7d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 15 Mar 2013 01:45:51 -0700
Subject: userns:  Don't allow creation if the user is chrooted

Guarantee that the policy of which files may be access that is
established by setting the root directory will not be violated
by user namespaces by verifying that the root directory points
to the root of the mount namespace at the time of user namespace
creation.

Changing the root is a privileged operation, and as a matter of policy
it serves to limit unprivileged processes to files below the current
root directory.

For reasons of simplicity and comprehensibility the privilege to
change the root directory is gated solely on the CAP_SYS_CHROOT
capability in the user namespace.  Therefore when creating a user
namespace we must ensure that the policy of which files may be access
can not be violated by changing the root directory.

Anyone who runs a processes in a chroot and would like to use user
namespace can setup the same view of filesystems with a mount
namespace instead.  With this result that this is not a practical
limitation for using user namespaces.

Cc: stable@vger.kernel.org
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Reported-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c            | 24 ++++++++++++++++++++++++
 include/linux/fs_struct.h |  2 ++
 kernel/user_namespace.c   |  9 +++++++++
 3 files changed, 35 insertions(+)

(limited to 'kernel')

diff --git a/fs/namespace.c b/fs/namespace.c
index 50ca17d3cb4..a3035223d42 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2732,6 +2732,30 @@ bool our_mnt(struct vfsmount *mnt)
 	return check_mnt(real_mount(mnt));
 }
 
+bool current_chrooted(void)
+{
+	/* Does the current process have a non-standard root */
+	struct path ns_root;
+	struct path fs_root;
+	bool chrooted;
+
+	/* Find the namespace root */
+	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
+	ns_root.dentry = ns_root.mnt->mnt_root;
+	path_get(&ns_root);
+	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
+		;
+
+	get_fs_root(current->fs, &fs_root);
+
+	chrooted = !path_equal(&fs_root, &ns_root);
+
+	path_put(&fs_root);
+	path_put(&ns_root);
+
+	return chrooted;
+}
+
 static void *mntns_get(struct task_struct *task)
 {
 	struct mnt_namespace *ns = NULL;
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 729eded4b24..2b93a9a5a1e 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -50,4 +50,6 @@ static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root,
 	spin_unlock(&fs->lock);
 }
 
+extern bool current_chrooted(void);
+
 #endif /* _LINUX_FS_STRUCT_H */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index b14f4d34204..0f1e4288457 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -61,6 +61,15 @@ int create_user_ns(struct cred *new)
 	kgid_t group = new->egid;
 	int ret;
 
+	/*
+	 * Verify that we can not violate the policy of which files
+	 * may be accessed that is specified by the root directory,
+	 * by verifing that the root directory is at the root of the
+	 * mount namespace which allows all files to be accessed.
+	 */
+	if (current_chrooted())
+		return -EPERM;
+
 	/* The creator needs a mapping in the parent user namespace
 	 * or else we won't be able to reasonably tell userspace who
 	 * created a user_namespace.
-- 
cgit v1.2.3-70-g09d2


From 87a8ebd637dafc255070f503909a053cf0d98d3f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 24 Mar 2013 14:28:27 -0700
Subject: userns: Restrict when proc and sysfs can be mounted

Only allow unprivileged mounts of proc and sysfs if they are already
mounted when the user namespace is created.

proc and sysfs are interesting because they have content that is
per namespace, and so fresh mounts are needed when new namespaces
are created while at the same time proc and sysfs have content that
is shared between every instance.

Respect the policy of who may see the shared content of proc and sysfs
by only allowing new mounts if there was an existing mount at the time
the user namespace was created.

In practice there are only two interesting cases: proc and sysfs are
mounted at their usual places, proc and sysfs are not mounted at all
(some form of mount namespace jail).

Cc: stable@vger.kernel.org
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c                 | 21 +++++++++++++++++++++
 fs/proc/root.c                 |  4 ++++
 fs/sysfs/mount.c               |  4 ++++
 include/linux/user_namespace.h |  4 ++++
 kernel/user.c                  |  2 ++
 kernel/user_namespace.c        |  2 ++
 6 files changed, 37 insertions(+)

(limited to 'kernel')

diff --git a/fs/namespace.c b/fs/namespace.c
index 968d4c5eae0..d581e45c0a9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2763,6 +2763,27 @@ bool current_chrooted(void)
 	return chrooted;
 }
 
+void update_mnt_policy(struct user_namespace *userns)
+{
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	struct mount *mnt;
+
+	down_read(&namespace_sem);
+	list_for_each_entry(mnt, &ns->list, mnt_list) {
+		switch (mnt->mnt.mnt_sb->s_magic) {
+		case SYSFS_MAGIC:
+			userns->may_mount_sysfs = true;
+			break;
+		case PROC_SUPER_MAGIC:
+			userns->may_mount_proc = true;
+			break;
+		}
+		if (userns->may_mount_sysfs && userns->may_mount_proc)
+			break;
+	}
+	up_read(&namespace_sem);
+}
+
 static void *mntns_get(struct task_struct *task)
 {
 	struct mnt_namespace *ns = NULL;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c6e9fac26ba..9c7fab1d23f 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
+#include <linux/user_namespace.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
 #include <linux/parser.h>
@@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	} else {
 		ns = task_active_pid_ns(current);
 		options = data;
+
+		if (!current_user_ns()->may_mount_proc)
+			return ERR_PTR(-EPERM);
 	}
 
 	sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 8d924b5ec73..afd83273e6c 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/user_namespace.h>
 
 #include "sysfs.h"
 
@@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	struct super_block *sb;
 	int error;
 
+	if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs)
+		return ERR_PTR(-EPERM);
+
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
 		return ERR_PTR(-ENOMEM);
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 4ce00932493..b6b215f13b4 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -26,6 +26,8 @@ struct user_namespace {
 	kuid_t			owner;
 	kgid_t			group;
 	unsigned int		proc_inum;
+	bool			may_mount_sysfs;
+	bool			may_mount_proc;
 };
 
 extern struct user_namespace init_user_ns;
@@ -82,4 +84,6 @@ static inline void put_user_ns(struct user_namespace *ns)
 
 #endif
 
+void update_mnt_policy(struct user_namespace *userns);
+
 #endif /* _LINUX_USER_H */
diff --git a/kernel/user.c b/kernel/user.c
index e81978e8c03..8e635a18ab5 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,6 +51,8 @@ struct user_namespace init_user_ns = {
 	.owner = GLOBAL_ROOT_UID,
 	.group = GLOBAL_ROOT_GID,
 	.proc_inum = PROC_USER_INIT_INO,
+	.may_mount_sysfs = true,
+	.may_mount_proc = true,
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 0f1e4288457..a54f26f82eb 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -96,6 +96,8 @@ int create_user_ns(struct cred *new)
 
 	set_cred_user_ns(new, ns);
 
+	update_mnt_policy(ns);
+
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From dbf520a9d7d4d5ba28d2947be11e34099a5e3e20 Mon Sep 17 00:00:00 2001
From: Paul Walmsley <paul@pwsan.com>
Date: Sun, 31 Mar 2013 00:04:40 +0000
Subject: Revert "lockdep: check that no locks held at freeze time"

This reverts commit 6aa9707099c4b25700940eb3d016f16c4434360d.

Commit 6aa9707099c4 ("lockdep: check that no locks held at freeze time")
causes problems with NFS root filesystems.  The failures were noticed on
OMAP2 and 3 boards during kernel init:

  [ BUG: swapper/0/1 still has locks held! ]
  3.9.0-rc3-00344-ga937536 #1 Not tainted
  -------------------------------------
  1 lock held by swapper/0/1:
   #0:  (&type->s_umount_key#13/1){+.+.+.}, at: [<c011e84c>] sget+0x248/0x574

  stack backtrace:
    rpc_wait_bit_killable
    __wait_on_bit
    out_of_line_wait_on_bit
    __rpc_execute
    rpc_run_task
    rpc_call_sync
    nfs_proc_get_root
    nfs_get_root
    nfs_fs_mount_common
    nfs_try_mount
    nfs_fs_mount
    mount_fs
    vfs_kern_mount
    do_mount
    sys_mount
    do_mount_root
    mount_root
    prepare_namespace
    kernel_init_freeable
    kernel_init

Although the rootfs mounts, the system is unstable.  Here's a transcript
from a PM test:

  http://www.pwsan.com/omap/testlogs/test_v3.9-rc3/20130317194234/pm/37xxevm/37xxevm_log.txt

Here's what the test log should look like:

  http://www.pwsan.com/omap/testlogs/test_v3.8/20130218214403/pm/37xxevm/37xxevm_log.txt

Mailing list discussion is here:

  http://lkml.org/lkml/2013/3/4/221

Deal with this for v3.9 by reverting the problem commit, until folks can
figure out the right long-term course of action.

Signed-off-by: Paul Walmsley <paul@pwsan.com>
Cc: Mandeep Singh Baines <msb@chromium.org>
Cc: Jeff Layton <jlayton@redhat.com>
Cc: Shawn Guo <shawn.guo@linaro.org>
Cc: <maciej.rutecki@gmail.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ben Chan <benchan@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/debug_locks.h |  4 ++--
 include/linux/freezer.h     |  3 ---
 kernel/exit.c               |  2 +-
 kernel/lockdep.c            | 17 +++++++++--------
 4 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index a975de1ff59..3bd46f76675 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -51,7 +51,7 @@ struct task_struct;
 extern void debug_show_all_locks(void);
 extern void debug_show_held_locks(struct task_struct *task);
 extern void debug_check_no_locks_freed(const void *from, unsigned long len);
-extern void debug_check_no_locks_held(void);
+extern void debug_check_no_locks_held(struct task_struct *task);
 #else
 static inline void debug_show_all_locks(void)
 {
@@ -67,7 +67,7 @@ debug_check_no_locks_freed(const void *from, unsigned long len)
 }
 
 static inline void
-debug_check_no_locks_held(void)
+debug_check_no_locks_held(struct task_struct *task)
 {
 }
 #endif
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 043a5cf8b5b..e70df40d84f 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -3,7 +3,6 @@
 #ifndef FREEZER_H_INCLUDED
 #define FREEZER_H_INCLUDED
 
-#include <linux/debug_locks.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/atomic.h>
@@ -49,8 +48,6 @@ extern void thaw_kernel_threads(void);
 
 static inline bool try_to_freeze(void)
 {
-	if (!(current->flags & PF_NOFREEZE))
-		debug_check_no_locks_held();
 	might_sleep();
 	if (likely(!freezing(current)))
 		return false;
diff --git a/kernel/exit.c b/kernel/exit.c
index 51e485ca993..60bc027c61c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -835,7 +835,7 @@ void do_exit(long code)
 	/*
 	 * Make sure we are holding no locks:
 	 */
-	debug_check_no_locks_held();
+	debug_check_no_locks_held(tsk);
 	/*
 	 * We can do this unlocked here. The futex code uses this flag
 	 * just to verify whether the pi state cleanup has been done
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 259db207b5d..8a0efac4f99 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4088,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
 }
 EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
 
-static void print_held_locks_bug(void)
+static void print_held_locks_bug(struct task_struct *curr)
 {
 	if (!debug_locks_off())
 		return;
@@ -4097,21 +4097,22 @@ static void print_held_locks_bug(void)
 
 	printk("\n");
 	printk("=====================================\n");
-	printk("[ BUG: %s/%d still has locks held! ]\n",
-	       current->comm, task_pid_nr(current));
+	printk("[ BUG: lock held at task exit time! ]\n");
 	print_kernel_ident();
 	printk("-------------------------------------\n");
-	lockdep_print_held_locks(current);
+	printk("%s/%d is exiting with locks still held!\n",
+		curr->comm, task_pid_nr(curr));
+	lockdep_print_held_locks(curr);
+
 	printk("\nstack backtrace:\n");
 	dump_stack();
 }
 
-void debug_check_no_locks_held(void)
+void debug_check_no_locks_held(struct task_struct *task)
 {
-	if (unlikely(current->lockdep_depth > 0))
-		print_held_locks_bug();
+	if (unlikely(task->lockdep_depth > 0))
+		print_held_locks_bug(task);
 }
-EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
 
 void debug_show_all_locks(void)
 {
-- 
cgit v1.2.3-70-g09d2