From 0ce8974d504913a0f0ae2d97b20a5ac665431a41 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 6 Jan 2012 03:13:27 -0800
Subject: sysctl: Consolidate !CONFIG_SYSCTL handling

- In sysctl.h move functions only available if CONFIG_SYSCL
  is defined inside of #ifdef CONFIG_SYSCTL

- Move the stub function definitions for !CONFIG_SYSCTL
  into sysctl.h and make them static inlines.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/sysctl.c | 26 --------------------------
 1 file changed, 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f487f257e05..d5bbddd0de2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2017,32 +2017,6 @@ void setup_sysctl_set(struct ctl_table_set *p,
 	p->is_seen = is_seen;
 }
 
-#else /* !CONFIG_SYSCTL */
-struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
-{
-	return NULL;
-}
-
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
-						    struct ctl_table *table)
-{
-	return NULL;
-}
-
-void unregister_sysctl_table(struct ctl_table_header * table)
-{
-}
-
-void setup_sysctl_set(struct ctl_table_set *p,
-	struct ctl_table_set *parent,
-	int (*is_seen)(struct ctl_table_set *))
-{
-}
-
-void sysctl_head_put(struct ctl_table_header *head)
-{
-}
-
 #endif /* CONFIG_SYSCTL */
 
 /*
-- 
cgit v1.2.3-70-g09d2


From de4e83bd6b5e16d491ec068cd22801d5d063b07a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 6 Jan 2012 03:34:20 -0800
Subject: sysctl: Register the base sysctl table like any other sysctl table.

Simplify the code by treating the base sysctl table like any other
sysctl table and register it with register_sysctl_table.

To ensure this table is registered early enough to avoid problems
call sysctl_init from proc_sys_init.

Rename sysctl_net.c:sysctl_init() to net_sysctl_init() to avoid
name conflicts now that kernel/sysctl.c:sysctl_init() is no longer
static.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/proc_sysctl.c  |  3 ++-
 include/linux/sysctl.h |  1 +
 kernel/sysctl.c        | 13 ++++---------
 net/sysctl_net.c       |  4 ++--
 4 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d82f4a8b4b8..9d29d28af57 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -468,5 +468,6 @@ int __init proc_sys_init(void)
 	proc_sys_root->proc_iops = &proc_sys_dir_operations;
 	proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
 	proc_sys_root->nlink = 0;
-	return 0;
+
+	return sysctl_init();
 }
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index cf3ee7f246d..5e3532e9599 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1095,6 +1095,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
 void unregister_sysctl_table(struct ctl_table_header * table);
 int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table);
 
+extern int sysctl_init(void);
 #else /* CONFIG_SYSCTL */
 static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
 {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d5bbddd0de2..ad460248acc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -192,7 +192,7 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
 
 #endif
 
-static struct ctl_table root_table[];
+static struct ctl_table root_table[1];
 static struct ctl_table_root sysctl_table_root;
 static struct ctl_table_header root_table_header = {
 	{{.count = 1,
@@ -222,7 +222,7 @@ int sysctl_legacy_va_layout;
 
 /* The default sysctl tables: */
 
-static struct ctl_table root_table[] = {
+static struct ctl_table sysctl_base_table[] = {
 	{
 		.procname	= "kernel",
 		.mode		= 0555,
@@ -1747,17 +1747,12 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
 	}
 }
 
-static __init int sysctl_init(void)
+int __init sysctl_init(void)
 {
-	sysctl_set_parent(NULL, root_table);
-#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
-	sysctl_check_table(current->nsproxy, root_table);
-#endif
+	register_sysctl_table(sysctl_base_table);
 	return 0;
 }
 
-core_initcall(sysctl_init);
-
 static struct ctl_table *is_branch_in(struct ctl_table *branch,
 				      struct ctl_table *table)
 {
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index e75813904f2..a6bbee2bc71 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -90,7 +90,7 @@ static struct pernet_operations sysctl_pernet_ops = {
 	.exit = sysctl_net_exit,
 };
 
-static __init int sysctl_init(void)
+static __init int net_sysctl_init(void)
 {
 	int ret;
 	ret = register_pernet_subsys(&sysctl_pernet_ops);
@@ -102,7 +102,7 @@ static __init int sysctl_init(void)
 out:
 	return ret;
 }
-subsys_initcall(sysctl_init);
+subsys_initcall(net_sysctl_init);
 
 struct ctl_table_header *register_net_sysctl_table(struct net *net,
 	const struct ctl_path *path, struct ctl_table *table)
-- 
cgit v1.2.3-70-g09d2


From 1f87f0b52b1d6581168cb80f86746bc4df918d01 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 6 Jan 2012 04:07:15 -0800
Subject: sysctl: Move the implementation into fs/proc/proc_sysctl.c

Move the core sysctl code from kernel/sysctl.c and kernel/sysctl_check.c
into fs/proc/proc_sysctl.c.

Currently sysctl maintenance is hampered by the sysctl implementation
being split across 3 files with artificial layering between them.
Consolidate the entire sysctl implementation into 1 file so that
it is easier to see what is going on and hopefully allowing for
simpler maintenance.

For functions that are now only used in fs/proc/proc_sysctl.c remove
their declarations from sysctl.h and make them static in fs/proc/proc_sysctl.c

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/internal.h     |   3 +
 fs/proc/proc_sysctl.c  | 622 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/sysctl.h |  16 --
 kernel/Makefile        |   1 -
 kernel/sysctl.c        | 464 ------------------------------------
 kernel/sysctl_check.c  | 160 -------------
 6 files changed, 625 insertions(+), 641 deletions(-)
 delete mode 100644 kernel/sysctl_check.c

(limited to 'kernel')

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 292577531ad..3b5ecd960d6 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -10,12 +10,15 @@
  */
 
 #include <linux/proc_fs.h>
+struct  ctl_table_header;
 
 extern struct proc_dir_entry proc_root;
 #ifdef CONFIG_PROC_SYSCTL
 extern int proc_sys_init(void);
+extern void sysctl_head_put(struct ctl_table_header *head);
 #else
 static inline void proc_sys_init(void) { }
+static inline void sysctl_head_put(struct ctl_table_header *head) { }
 #endif
 #ifdef CONFIG_NET
 extern int proc_net_init(void);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 9d29d28af57..06e6f10ee8e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -7,6 +7,7 @@
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/namei.h>
+#include <linux/module.h>
 #include "internal.h"
 
 static const struct dentry_operations proc_sys_dentry_operations;
@@ -24,6 +25,209 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll)
 	wake_up_interruptible(&poll->wait);
 }
 
+static struct ctl_table root_table[1];
+static struct ctl_table_root sysctl_table_root;
+static struct ctl_table_header root_table_header = {
+	{{.count = 1,
+	.ctl_table = root_table,
+	.ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
+	.root = &sysctl_table_root,
+	.set = &sysctl_table_root.default_set,
+};
+static struct ctl_table_root sysctl_table_root = {
+	.root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
+	.default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
+};
+
+static DEFINE_SPINLOCK(sysctl_lock);
+
+/* called under sysctl_lock */
+static int use_table(struct ctl_table_header *p)
+{
+	if (unlikely(p->unregistering))
+		return 0;
+	p->used++;
+	return 1;
+}
+
+/* called under sysctl_lock */
+static void unuse_table(struct ctl_table_header *p)
+{
+	if (!--p->used)
+		if (unlikely(p->unregistering))
+			complete(p->unregistering);
+}
+
+/* called under sysctl_lock, will reacquire if has to wait */
+static void start_unregistering(struct ctl_table_header *p)
+{
+	/*
+	 * if p->used is 0, nobody will ever touch that entry again;
+	 * we'll eliminate all paths to it before dropping sysctl_lock
+	 */
+	if (unlikely(p->used)) {
+		struct completion wait;
+		init_completion(&wait);
+		p->unregistering = &wait;
+		spin_unlock(&sysctl_lock);
+		wait_for_completion(&wait);
+		spin_lock(&sysctl_lock);
+	} else {
+		/* anything non-NULL; we'll never dereference it */
+		p->unregistering = ERR_PTR(-EINVAL);
+	}
+	/*
+	 * do not remove from the list until nobody holds it; walking the
+	 * list in do_sysctl() relies on that.
+	 */
+	list_del_init(&p->ctl_entry);
+}
+
+static void sysctl_head_get(struct ctl_table_header *head)
+{
+	spin_lock(&sysctl_lock);
+	head->count++;
+	spin_unlock(&sysctl_lock);
+}
+
+void sysctl_head_put(struct ctl_table_header *head)
+{
+	spin_lock(&sysctl_lock);
+	if (!--head->count)
+		kfree_rcu(head, rcu);
+	spin_unlock(&sysctl_lock);
+}
+
+static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
+{
+	if (!head)
+		BUG();
+	spin_lock(&sysctl_lock);
+	if (!use_table(head))
+		head = ERR_PTR(-ENOENT);
+	spin_unlock(&sysctl_lock);
+	return head;
+}
+
+static void sysctl_head_finish(struct ctl_table_header *head)
+{
+	if (!head)
+		return;
+	spin_lock(&sysctl_lock);
+	unuse_table(head);
+	spin_unlock(&sysctl_lock);
+}
+
+static struct ctl_table_set *
+lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+	struct ctl_table_set *set = &root->default_set;
+	if (root->lookup)
+		set = root->lookup(root, namespaces);
+	return set;
+}
+
+static struct list_head *
+lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+	struct ctl_table_set *set = lookup_header_set(root, namespaces);
+	return &set->list;
+}
+
+static struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
+						struct ctl_table_header *prev)
+{
+	struct ctl_table_root *root;
+	struct list_head *header_list;
+	struct ctl_table_header *head;
+	struct list_head *tmp;
+
+	spin_lock(&sysctl_lock);
+	if (prev) {
+		head = prev;
+		tmp = &prev->ctl_entry;
+		unuse_table(prev);
+		goto next;
+	}
+	tmp = &root_table_header.ctl_entry;
+	for (;;) {
+		head = list_entry(tmp, struct ctl_table_header, ctl_entry);
+
+		if (!use_table(head))
+			goto next;
+		spin_unlock(&sysctl_lock);
+		return head;
+	next:
+		root = head->root;
+		tmp = tmp->next;
+		header_list = lookup_header_list(root, namespaces);
+		if (tmp != header_list)
+			continue;
+
+		do {
+			root = list_entry(root->root_list.next,
+					struct ctl_table_root, root_list);
+			if (root == &sysctl_table_root)
+				goto out;
+			header_list = lookup_header_list(root, namespaces);
+		} while (list_empty(header_list));
+		tmp = header_list->next;
+	}
+out:
+	spin_unlock(&sysctl_lock);
+	return NULL;
+}
+
+static struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
+{
+	return __sysctl_head_next(current->nsproxy, prev);
+}
+
+void register_sysctl_root(struct ctl_table_root *root)
+{
+	spin_lock(&sysctl_lock);
+	list_add_tail(&root->root_list, &sysctl_table_root.root_list);
+	spin_unlock(&sysctl_lock);
+}
+
+/*
+ * sysctl_perm does NOT grant the superuser all rights automatically, because
+ * some sysctl variables are readonly even to root.
+ */
+
+static int test_perm(int mode, int op)
+{
+	if (!current_euid())
+		mode >>= 6;
+	else if (in_egroup_p(0))
+		mode >>= 3;
+	if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
+		return 0;
+	return -EACCES;
+}
+
+static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
+{
+	int mode;
+
+	if (root->permissions)
+		mode = root->permissions(root, current->nsproxy, table);
+	else
+		mode = table->mode;
+
+	return test_perm(mode, op);
+}
+
+static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
+{
+	for (; table->procname; table++) {
+		table->parent = parent;
+		if (table->child)
+			sysctl_set_parent(table, table->child);
+	}
+}
+
+
 static struct inode *proc_sys_make_inode(struct super_block *sb,
 		struct ctl_table_header *head, struct ctl_table *table)
 {
@@ -435,6 +639,21 @@ static int proc_sys_delete(const struct dentry *dentry)
 	return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
 
+static int sysctl_is_seen(struct ctl_table_header *p)
+{
+	struct ctl_table_set *set = p->set;
+	int res;
+	spin_lock(&sysctl_lock);
+	if (p->unregistering)
+		res = 0;
+	else if (!set->is_seen)
+		res = 1;
+	else
+		res = set->is_seen(set);
+	spin_unlock(&sysctl_lock);
+	return res;
+}
+
 static int proc_sys_compare(const struct dentry *parent,
 		const struct inode *pinode,
 		const struct dentry *dentry, const struct inode *inode,
@@ -460,6 +679,409 @@ static const struct dentry_operations proc_sys_dentry_operations = {
 	.d_compare	= proc_sys_compare,
 };
 
+static struct ctl_table *is_branch_in(struct ctl_table *branch,
+				      struct ctl_table *table)
+{
+	struct ctl_table *p;
+	const char *s = branch->procname;
+
+	/* branch should have named subdirectory as its first element */
+	if (!s || !branch->child)
+		return NULL;
+
+	/* ... and nothing else */
+	if (branch[1].procname)
+		return NULL;
+
+	/* table should contain subdirectory with the same name */
+	for (p = table; p->procname; p++) {
+		if (!p->child)
+			continue;
+		if (p->procname && strcmp(p->procname, s) == 0)
+			return p;
+	}
+	return NULL;
+}
+
+/* see if attaching q to p would be an improvement */
+static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
+{
+	struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
+	struct ctl_table *next;
+	int is_better = 0;
+	int not_in_parent = !p->attached_by;
+
+	while ((next = is_branch_in(by, to)) != NULL) {
+		if (by == q->attached_by)
+			is_better = 1;
+		if (to == p->attached_by)
+			not_in_parent = 1;
+		by = by->child;
+		to = next->child;
+	}
+
+	if (is_better && not_in_parent) {
+		q->attached_by = by;
+		q->attached_to = to;
+		q->parent = p;
+	}
+}
+
+#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
+static int sysctl_depth(struct ctl_table *table)
+{
+	struct ctl_table *tmp;
+	int depth;
+
+	depth = 0;
+	for (tmp = table; tmp->parent; tmp = tmp->parent)
+		depth++;
+
+	return depth;
+}
+
+static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
+{
+	int i;
+
+	for (i = 0; table && i < n; i++)
+		table = table->parent;
+
+	return table;
+}
+
+
+static void sysctl_print_path(struct ctl_table *table)
+{
+	struct ctl_table *tmp;
+	int depth, i;
+	depth = sysctl_depth(table);
+	if (table->procname) {
+		for (i = depth; i >= 0; i--) {
+			tmp = sysctl_parent(table, i);
+			printk("/%s", tmp->procname?tmp->procname:"");
+		}
+	}
+	printk(" ");
+}
+
+static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
+						struct ctl_table *table)
+{
+	struct ctl_table_header *head;
+	struct ctl_table *ref, *test;
+	int depth, cur_depth;
+
+	depth = sysctl_depth(table);
+
+	for (head = __sysctl_head_next(namespaces, NULL); head;
+	     head = __sysctl_head_next(namespaces, head)) {
+		cur_depth = depth;
+		ref = head->ctl_table;
+repeat:
+		test = sysctl_parent(table, cur_depth);
+		for (; ref->procname; ref++) {
+			int match = 0;
+			if (cur_depth && !ref->child)
+				continue;
+
+			if (test->procname && ref->procname &&
+			    (strcmp(test->procname, ref->procname) == 0))
+					match++;
+
+			if (match) {
+				if (cur_depth != 0) {
+					cur_depth--;
+					ref = ref->child;
+					goto repeat;
+				}
+				goto out;
+			}
+		}
+	}
+	ref = NULL;
+out:
+	sysctl_head_finish(head);
+	return ref;
+}
+
+static void set_fail(const char **fail, struct ctl_table *table, const char *str)
+{
+	if (*fail) {
+		printk(KERN_ERR "sysctl table check failed: ");
+		sysctl_print_path(table);
+		printk(" %s\n", *fail);
+		dump_stack();
+	}
+	*fail = str;
+}
+
+static void sysctl_check_leaf(struct nsproxy *namespaces,
+				struct ctl_table *table, const char **fail)
+{
+	struct ctl_table *ref;
+
+	ref = sysctl_check_lookup(namespaces, table);
+	if (ref && (ref != table))
+		set_fail(fail, table, "Sysctl already exists");
+}
+
+static int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
+{
+	int error = 0;
+	for (; table->procname; table++) {
+		const char *fail = NULL;
+
+		if (table->parent) {
+			if (!table->parent->procname)
+				set_fail(&fail, table, "Parent without procname");
+		}
+		if (table->child) {
+			if (table->data)
+				set_fail(&fail, table, "Directory with data?");
+			if (table->maxlen)
+				set_fail(&fail, table, "Directory with maxlen?");
+			if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
+				set_fail(&fail, table, "Writable sysctl directory");
+			if (table->proc_handler)
+				set_fail(&fail, table, "Directory with proc_handler");
+			if (table->extra1)
+				set_fail(&fail, table, "Directory with extra1");
+			if (table->extra2)
+				set_fail(&fail, table, "Directory with extra2");
+		} else {
+			if ((table->proc_handler == proc_dostring) ||
+			    (table->proc_handler == proc_dointvec) ||
+			    (table->proc_handler == proc_dointvec_minmax) ||
+			    (table->proc_handler == proc_dointvec_jiffies) ||
+			    (table->proc_handler == proc_dointvec_userhz_jiffies) ||
+			    (table->proc_handler == proc_dointvec_ms_jiffies) ||
+			    (table->proc_handler == proc_doulongvec_minmax) ||
+			    (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+				if (!table->data)
+					set_fail(&fail, table, "No data");
+				if (!table->maxlen)
+					set_fail(&fail, table, "No maxlen");
+			}
+#ifdef CONFIG_PROC_SYSCTL
+			if (!table->proc_handler)
+				set_fail(&fail, table, "No proc_handler");
+#endif
+			sysctl_check_leaf(namespaces, table, &fail);
+		}
+		if (table->mode > 0777)
+			set_fail(&fail, table, "bogus .mode");
+		if (fail) {
+			set_fail(&fail, table, NULL);
+			error = -EINVAL;
+		}
+		if (table->child)
+			error |= sysctl_check_table(namespaces, table->child);
+	}
+	return error;
+}
+#endif /* CONFIG_SYSCTL_SYSCALL_CHECK */
+
+/**
+ * __register_sysctl_paths - register a sysctl hierarchy
+ * @root: List of sysctl headers to register on
+ * @namespaces: Data to compute which lists of sysctl entries are visible
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * The members of the &struct ctl_table structure are used as follows:
+ *
+ * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
+ *            enter a sysctl file
+ *
+ * data - a pointer to data for use by proc_handler
+ *
+ * maxlen - the maximum size in bytes of the data
+ *
+ * mode - the file permissions for the /proc/sys file, and for sysctl(2)
+ *
+ * child - a pointer to the child sysctl table if this entry is a directory, or
+ *         %NULL.
+ *
+ * proc_handler - the text handler routine (described below)
+ *
+ * de - for internal use by the sysctl routines
+ *
+ * extra1, extra2 - extra pointers usable by the proc handler routines
+ *
+ * Leaf nodes in the sysctl tree will be represented by a single file
+ * under /proc; non-leaf nodes will be represented by directories.
+ *
+ * sysctl(2) can automatically manage read and write requests through
+ * the sysctl table.  The data and maxlen fields of the ctl_table
+ * struct enable minimal validation of the values being written to be
+ * performed, and the mode field allows minimal authentication.
+ *
+ * There must be a proc_handler routine for any terminal nodes
+ * mirrored under /proc/sys (non-terminals are handled by a built-in
+ * directory handler).  Several default handlers are available to
+ * cover common cases -
+ *
+ * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
+ * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
+ * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
+ *
+ * It is the handler's job to read the input buffer from user memory
+ * and process it. The handler should return 0 on success.
+ *
+ * This routine returns %NULL on a failure to register, and a pointer
+ * to the table header on success.
+ */
+struct ctl_table_header *__register_sysctl_paths(
+	struct ctl_table_root *root,
+	struct nsproxy *namespaces,
+	const struct ctl_path *path, struct ctl_table *table)
+{
+	struct ctl_table_header *header;
+	struct ctl_table *new, **prevp;
+	unsigned int n, npath;
+	struct ctl_table_set *set;
+
+	/* Count the path components */
+	for (npath = 0; path[npath].procname; ++npath)
+		;
+
+	/*
+	 * For each path component, allocate a 2-element ctl_table array.
+	 * The first array element will be filled with the sysctl entry
+	 * for this, the second will be the sentinel (procname == 0).
+	 *
+	 * We allocate everything in one go so that we don't have to
+	 * worry about freeing additional memory in unregister_sysctl_table.
+	 */
+	header = kzalloc(sizeof(struct ctl_table_header) +
+			 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
+	if (!header)
+		return NULL;
+
+	new = (struct ctl_table *) (header + 1);
+
+	/* Now connect the dots */
+	prevp = &header->ctl_table;
+	for (n = 0; n < npath; ++n, ++path) {
+		/* Copy the procname */
+		new->procname = path->procname;
+		new->mode     = 0555;
+
+		*prevp = new;
+		prevp = &new->child;
+
+		new += 2;
+	}
+	*prevp = table;
+	header->ctl_table_arg = table;
+
+	INIT_LIST_HEAD(&header->ctl_entry);
+	header->used = 0;
+	header->unregistering = NULL;
+	header->root = root;
+	sysctl_set_parent(NULL, header->ctl_table);
+	header->count = 1;
+#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
+	if (sysctl_check_table(namespaces, header->ctl_table)) {
+		kfree(header);
+		return NULL;
+	}
+#endif
+	spin_lock(&sysctl_lock);
+	header->set = lookup_header_set(root, namespaces);
+	header->attached_by = header->ctl_table;
+	header->attached_to = root_table;
+	header->parent = &root_table_header;
+	for (set = header->set; set; set = set->parent) {
+		struct ctl_table_header *p;
+		list_for_each_entry(p, &set->list, ctl_entry) {
+			if (p->unregistering)
+				continue;
+			try_attach(p, header);
+		}
+	}
+	header->parent->count++;
+	list_add_tail(&header->ctl_entry, &header->set->list);
+	spin_unlock(&sysctl_lock);
+
+	return header;
+}
+
+/**
+ * register_sysctl_table_path - register a sysctl table hierarchy
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+						struct ctl_table *table)
+{
+	return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
+					path, table);
+}
+EXPORT_SYMBOL(register_sysctl_paths);
+
+/**
+ * register_sysctl_table - register a sysctl table hierarchy
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
+{
+	static const struct ctl_path null_path[] = { {} };
+
+	return register_sysctl_paths(null_path, table);
+}
+EXPORT_SYMBOL(register_sysctl_table);
+
+/**
+ * unregister_sysctl_table - unregister a sysctl table hierarchy
+ * @header: the header returned from register_sysctl_table
+ *
+ * Unregisters the sysctl table and all children. proc entries may not
+ * actually be removed until they are no longer used by anyone.
+ */
+void unregister_sysctl_table(struct ctl_table_header * header)
+{
+	might_sleep();
+
+	if (header == NULL)
+		return;
+
+	spin_lock(&sysctl_lock);
+	start_unregistering(header);
+	if (!--header->parent->count) {
+		WARN_ON(1);
+		kfree_rcu(header->parent, rcu);
+	}
+	if (!--header->count)
+		kfree_rcu(header, rcu);
+	spin_unlock(&sysctl_lock);
+}
+EXPORT_SYMBOL(unregister_sysctl_table);
+
+void setup_sysctl_set(struct ctl_table_set *p,
+	struct ctl_table_set *parent,
+	int (*is_seen)(struct ctl_table_set *))
+{
+	INIT_LIST_HEAD(&p->list);
+	p->parent = parent ? parent : &sysctl_table_root.default_set;
+	p->is_seen = is_seen;
+}
+
+
 int __init proc_sys_init(void)
 {
 	struct proc_dir_entry *proc_sys_root;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 5e3532e9599..08cabbfddac 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1073,17 +1073,6 @@ extern void setup_sysctl_set(struct ctl_table_set *p,
 	struct ctl_table_set *parent,
 	int (*is_seen)(struct ctl_table_set *));
 
-extern void sysctl_head_get(struct ctl_table_header *);
-extern void sysctl_head_put(struct ctl_table_header *);
-extern int sysctl_is_seen(struct ctl_table_header *);
-extern struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *);
-extern struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev);
-extern struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
-						struct ctl_table_header *prev);
-extern void sysctl_head_finish(struct ctl_table_header *prev);
-extern int sysctl_perm(struct ctl_table_root *root,
-		struct ctl_table *table, int op);
-
 void register_sysctl_root(struct ctl_table_root *root);
 struct ctl_table_header *__register_sysctl_paths(
 	struct ctl_table_root *root, struct nsproxy *namespaces,
@@ -1093,7 +1082,6 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
 						struct ctl_table *table);
 
 void unregister_sysctl_table(struct ctl_table_header * table);
-int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table);
 
 extern int sysctl_init(void);
 #else /* CONFIG_SYSCTL */
@@ -1118,10 +1106,6 @@ static inline void setup_sysctl_set(struct ctl_table_set *p,
 {
 }
 
-static inline void sysctl_head_put(struct ctl_table_header *head)
-{
-}
-
 #endif /* CONFIG_SYSCTL */
 
 #endif /* __KERNEL__ */
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d9de86b7e7..cb41b9547c9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,7 +27,6 @@ obj-y += power/
 
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
-obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ad460248acc..b774909ed46 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -192,20 +192,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
 
 #endif
 
-static struct ctl_table root_table[1];
-static struct ctl_table_root sysctl_table_root;
-static struct ctl_table_header root_table_header = {
-	{{.count = 1,
-	.ctl_table = root_table,
-	.ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
-	.root = &sysctl_table_root,
-	.set = &sysctl_table_root.default_set,
-};
-static struct ctl_table_root sysctl_table_root = {
-	.root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
-	.default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
-};
-
 static struct ctl_table kern_table[];
 static struct ctl_table vm_table[];
 static struct ctl_table fs_table[];
@@ -1559,459 +1545,12 @@ static struct ctl_table dev_table[] = {
 	{ }
 };
 
-static DEFINE_SPINLOCK(sysctl_lock);
-
-/* called under sysctl_lock */
-static int use_table(struct ctl_table_header *p)
-{
-	if (unlikely(p->unregistering))
-		return 0;
-	p->used++;
-	return 1;
-}
-
-/* called under sysctl_lock */
-static void unuse_table(struct ctl_table_header *p)
-{
-	if (!--p->used)
-		if (unlikely(p->unregistering))
-			complete(p->unregistering);
-}
-
-/* called under sysctl_lock, will reacquire if has to wait */
-static void start_unregistering(struct ctl_table_header *p)
-{
-	/*
-	 * if p->used is 0, nobody will ever touch that entry again;
-	 * we'll eliminate all paths to it before dropping sysctl_lock
-	 */
-	if (unlikely(p->used)) {
-		struct completion wait;
-		init_completion(&wait);
-		p->unregistering = &wait;
-		spin_unlock(&sysctl_lock);
-		wait_for_completion(&wait);
-		spin_lock(&sysctl_lock);
-	} else {
-		/* anything non-NULL; we'll never dereference it */
-		p->unregistering = ERR_PTR(-EINVAL);
-	}
-	/*
-	 * do not remove from the list until nobody holds it; walking the
-	 * list in do_sysctl() relies on that.
-	 */
-	list_del_init(&p->ctl_entry);
-}
-
-void sysctl_head_get(struct ctl_table_header *head)
-{
-	spin_lock(&sysctl_lock);
-	head->count++;
-	spin_unlock(&sysctl_lock);
-}
-
-void sysctl_head_put(struct ctl_table_header *head)
-{
-	spin_lock(&sysctl_lock);
-	if (!--head->count)
-		kfree_rcu(head, rcu);
-	spin_unlock(&sysctl_lock);
-}
-
-struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
-{
-	if (!head)
-		BUG();
-	spin_lock(&sysctl_lock);
-	if (!use_table(head))
-		head = ERR_PTR(-ENOENT);
-	spin_unlock(&sysctl_lock);
-	return head;
-}
-
-void sysctl_head_finish(struct ctl_table_header *head)
-{
-	if (!head)
-		return;
-	spin_lock(&sysctl_lock);
-	unuse_table(head);
-	spin_unlock(&sysctl_lock);
-}
-
-static struct ctl_table_set *
-lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
-{
-	struct ctl_table_set *set = &root->default_set;
-	if (root->lookup)
-		set = root->lookup(root, namespaces);
-	return set;
-}
-
-static struct list_head *
-lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
-{
-	struct ctl_table_set *set = lookup_header_set(root, namespaces);
-	return &set->list;
-}
-
-struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
-					    struct ctl_table_header *prev)
-{
-	struct ctl_table_root *root;
-	struct list_head *header_list;
-	struct ctl_table_header *head;
-	struct list_head *tmp;
-
-	spin_lock(&sysctl_lock);
-	if (prev) {
-		head = prev;
-		tmp = &prev->ctl_entry;
-		unuse_table(prev);
-		goto next;
-	}
-	tmp = &root_table_header.ctl_entry;
-	for (;;) {
-		head = list_entry(tmp, struct ctl_table_header, ctl_entry);
-
-		if (!use_table(head))
-			goto next;
-		spin_unlock(&sysctl_lock);
-		return head;
-	next:
-		root = head->root;
-		tmp = tmp->next;
-		header_list = lookup_header_list(root, namespaces);
-		if (tmp != header_list)
-			continue;
-
-		do {
-			root = list_entry(root->root_list.next,
-					struct ctl_table_root, root_list);
-			if (root == &sysctl_table_root)
-				goto out;
-			header_list = lookup_header_list(root, namespaces);
-		} while (list_empty(header_list));
-		tmp = header_list->next;
-	}
-out:
-	spin_unlock(&sysctl_lock);
-	return NULL;
-}
-
-struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
-{
-	return __sysctl_head_next(current->nsproxy, prev);
-}
-
-void register_sysctl_root(struct ctl_table_root *root)
-{
-	spin_lock(&sysctl_lock);
-	list_add_tail(&root->root_list, &sysctl_table_root.root_list);
-	spin_unlock(&sysctl_lock);
-}
-
-/*
- * sysctl_perm does NOT grant the superuser all rights automatically, because
- * some sysctl variables are readonly even to root.
- */
-
-static int test_perm(int mode, int op)
-{
-	if (!current_euid())
-		mode >>= 6;
-	else if (in_egroup_p(0))
-		mode >>= 3;
-	if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
-		return 0;
-	return -EACCES;
-}
-
-int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
-{
-	int mode;
-
-	if (root->permissions)
-		mode = root->permissions(root, current->nsproxy, table);
-	else
-		mode = table->mode;
-
-	return test_perm(mode, op);
-}
-
-static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
-{
-	for (; table->procname; table++) {
-		table->parent = parent;
-		if (table->child)
-			sysctl_set_parent(table, table->child);
-	}
-}
-
 int __init sysctl_init(void)
 {
 	register_sysctl_table(sysctl_base_table);
 	return 0;
 }
 
-static struct ctl_table *is_branch_in(struct ctl_table *branch,
-				      struct ctl_table *table)
-{
-	struct ctl_table *p;
-	const char *s = branch->procname;
-
-	/* branch should have named subdirectory as its first element */
-	if (!s || !branch->child)
-		return NULL;
-
-	/* ... and nothing else */
-	if (branch[1].procname)
-		return NULL;
-
-	/* table should contain subdirectory with the same name */
-	for (p = table; p->procname; p++) {
-		if (!p->child)
-			continue;
-		if (p->procname && strcmp(p->procname, s) == 0)
-			return p;
-	}
-	return NULL;
-}
-
-/* see if attaching q to p would be an improvement */
-static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
-{
-	struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
-	struct ctl_table *next;
-	int is_better = 0;
-	int not_in_parent = !p->attached_by;
-
-	while ((next = is_branch_in(by, to)) != NULL) {
-		if (by == q->attached_by)
-			is_better = 1;
-		if (to == p->attached_by)
-			not_in_parent = 1;
-		by = by->child;
-		to = next->child;
-	}
-
-	if (is_better && not_in_parent) {
-		q->attached_by = by;
-		q->attached_to = to;
-		q->parent = p;
-	}
-}
-
-/**
- * __register_sysctl_paths - register a sysctl hierarchy
- * @root: List of sysctl headers to register on
- * @namespaces: Data to compute which lists of sysctl entries are visible
- * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * The members of the &struct ctl_table structure are used as follows:
- *
- * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
- *            enter a sysctl file
- *
- * data - a pointer to data for use by proc_handler
- *
- * maxlen - the maximum size in bytes of the data
- *
- * mode - the file permissions for the /proc/sys file, and for sysctl(2)
- *
- * child - a pointer to the child sysctl table if this entry is a directory, or
- *         %NULL.
- *
- * proc_handler - the text handler routine (described below)
- *
- * de - for internal use by the sysctl routines
- *
- * extra1, extra2 - extra pointers usable by the proc handler routines
- *
- * Leaf nodes in the sysctl tree will be represented by a single file
- * under /proc; non-leaf nodes will be represented by directories.
- *
- * sysctl(2) can automatically manage read and write requests through
- * the sysctl table.  The data and maxlen fields of the ctl_table
- * struct enable minimal validation of the values being written to be
- * performed, and the mode field allows minimal authentication.
- *
- * There must be a proc_handler routine for any terminal nodes
- * mirrored under /proc/sys (non-terminals are handled by a built-in
- * directory handler).  Several default handlers are available to
- * cover common cases -
- *
- * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
- * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), 
- * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
- *
- * It is the handler's job to read the input buffer from user memory
- * and process it. The handler should return 0 on success.
- *
- * This routine returns %NULL on a failure to register, and a pointer
- * to the table header on success.
- */
-struct ctl_table_header *__register_sysctl_paths(
-	struct ctl_table_root *root,
-	struct nsproxy *namespaces,
-	const struct ctl_path *path, struct ctl_table *table)
-{
-	struct ctl_table_header *header;
-	struct ctl_table *new, **prevp;
-	unsigned int n, npath;
-	struct ctl_table_set *set;
-
-	/* Count the path components */
-	for (npath = 0; path[npath].procname; ++npath)
-		;
-
-	/*
-	 * For each path component, allocate a 2-element ctl_table array.
-	 * The first array element will be filled with the sysctl entry
-	 * for this, the second will be the sentinel (procname == 0).
-	 *
-	 * We allocate everything in one go so that we don't have to
-	 * worry about freeing additional memory in unregister_sysctl_table.
-	 */
-	header = kzalloc(sizeof(struct ctl_table_header) +
-			 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
-	if (!header)
-		return NULL;
-
-	new = (struct ctl_table *) (header + 1);
-
-	/* Now connect the dots */
-	prevp = &header->ctl_table;
-	for (n = 0; n < npath; ++n, ++path) {
-		/* Copy the procname */
-		new->procname = path->procname;
-		new->mode     = 0555;
-
-		*prevp = new;
-		prevp = &new->child;
-
-		new += 2;
-	}
-	*prevp = table;
-	header->ctl_table_arg = table;
-
-	INIT_LIST_HEAD(&header->ctl_entry);
-	header->used = 0;
-	header->unregistering = NULL;
-	header->root = root;
-	sysctl_set_parent(NULL, header->ctl_table);
-	header->count = 1;
-#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
-	if (sysctl_check_table(namespaces, header->ctl_table)) {
-		kfree(header);
-		return NULL;
-	}
-#endif
-	spin_lock(&sysctl_lock);
-	header->set = lookup_header_set(root, namespaces);
-	header->attached_by = header->ctl_table;
-	header->attached_to = root_table;
-	header->parent = &root_table_header;
-	for (set = header->set; set; set = set->parent) {
-		struct ctl_table_header *p;
-		list_for_each_entry(p, &set->list, ctl_entry) {
-			if (p->unregistering)
-				continue;
-			try_attach(p, header);
-		}
-	}
-	header->parent->count++;
-	list_add_tail(&header->ctl_entry, &header->set->list);
-	spin_unlock(&sysctl_lock);
-
-	return header;
-}
-
-/**
- * register_sysctl_table_path - register a sysctl table hierarchy
- * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See __register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
-						struct ctl_table *table)
-{
-	return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
-					path, table);
-}
-
-/**
- * register_sysctl_table - register a sysctl table hierarchy
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
-{
-	static const struct ctl_path null_path[] = { {} };
-
-	return register_sysctl_paths(null_path, table);
-}
-
-/**
- * unregister_sysctl_table - unregister a sysctl table hierarchy
- * @header: the header returned from register_sysctl_table
- *
- * Unregisters the sysctl table and all children. proc entries may not
- * actually be removed until they are no longer used by anyone.
- */
-void unregister_sysctl_table(struct ctl_table_header * header)
-{
-	might_sleep();
-
-	if (header == NULL)
-		return;
-
-	spin_lock(&sysctl_lock);
-	start_unregistering(header);
-	if (!--header->parent->count) {
-		WARN_ON(1);
-		kfree_rcu(header->parent, rcu);
-	}
-	if (!--header->count)
-		kfree_rcu(header, rcu);
-	spin_unlock(&sysctl_lock);
-}
-
-int sysctl_is_seen(struct ctl_table_header *p)
-{
-	struct ctl_table_set *set = p->set;
-	int res;
-	spin_lock(&sysctl_lock);
-	if (p->unregistering)
-		res = 0;
-	else if (!set->is_seen)
-		res = 1;
-	else
-		res = set->is_seen(set);
-	spin_unlock(&sysctl_lock);
-	return res;
-}
-
-void setup_sysctl_set(struct ctl_table_set *p,
-	struct ctl_table_set *parent,
-	int (*is_seen)(struct ctl_table_set *))
-{
-	INIT_LIST_HEAD(&p->list);
-	p->parent = parent ? parent : &sysctl_table_root.default_set;
-	p->is_seen = is_seen;
-}
-
 #endif /* CONFIG_SYSCTL */
 
 /*
@@ -2977,6 +2516,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
 EXPORT_SYMBOL(proc_dostring);
 EXPORT_SYMBOL(proc_doulongvec_minmax);
 EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
-EXPORT_SYMBOL(register_sysctl_table);
-EXPORT_SYMBOL(register_sysctl_paths);
-EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
deleted file mode 100644
index 362da653813..00000000000
--- a/kernel/sysctl_check.c
+++ /dev/null
@@ -1,160 +0,0 @@
-#include <linux/stat.h>
-#include <linux/sysctl.h>
-#include "../fs/xfs/xfs_sysctl.h"
-#include <linux/sunrpc/debug.h>
-#include <linux/string.h>
-#include <net/ip_vs.h>
-
-
-static int sysctl_depth(struct ctl_table *table)
-{
-	struct ctl_table *tmp;
-	int depth;
-
-	depth = 0;
-	for (tmp = table; tmp->parent; tmp = tmp->parent)
-		depth++;
-
-	return depth;
-}
-
-static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
-{
-	int i;
-
-	for (i = 0; table && i < n; i++)
-		table = table->parent;
-
-	return table;
-}
-
-
-static void sysctl_print_path(struct ctl_table *table)
-{
-	struct ctl_table *tmp;
-	int depth, i;
-	depth = sysctl_depth(table);
-	if (table->procname) {
-		for (i = depth; i >= 0; i--) {
-			tmp = sysctl_parent(table, i);
-			printk("/%s", tmp->procname?tmp->procname:"");
-		}
-	}
-	printk(" ");
-}
-
-static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
-						struct ctl_table *table)
-{
-	struct ctl_table_header *head;
-	struct ctl_table *ref, *test;
-	int depth, cur_depth;
-
-	depth = sysctl_depth(table);
-
-	for (head = __sysctl_head_next(namespaces, NULL); head;
-	     head = __sysctl_head_next(namespaces, head)) {
-		cur_depth = depth;
-		ref = head->ctl_table;
-repeat:
-		test = sysctl_parent(table, cur_depth);
-		for (; ref->procname; ref++) {
-			int match = 0;
-			if (cur_depth && !ref->child)
-				continue;
-
-			if (test->procname && ref->procname &&
-			    (strcmp(test->procname, ref->procname) == 0))
-					match++;
-
-			if (match) {
-				if (cur_depth != 0) {
-					cur_depth--;
-					ref = ref->child;
-					goto repeat;
-				}
-				goto out;
-			}
-		}
-	}
-	ref = NULL;
-out:
-	sysctl_head_finish(head);
-	return ref;
-}
-
-static void set_fail(const char **fail, struct ctl_table *table, const char *str)
-{
-	if (*fail) {
-		printk(KERN_ERR "sysctl table check failed: ");
-		sysctl_print_path(table);
-		printk(" %s\n", *fail);
-		dump_stack();
-	}
-	*fail = str;
-}
-
-static void sysctl_check_leaf(struct nsproxy *namespaces,
-				struct ctl_table *table, const char **fail)
-{
-	struct ctl_table *ref;
-
-	ref = sysctl_check_lookup(namespaces, table);
-	if (ref && (ref != table))
-		set_fail(fail, table, "Sysctl already exists");
-}
-
-int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
-{
-	int error = 0;
-	for (; table->procname; table++) {
-		const char *fail = NULL;
-
-		if (table->parent) {
-			if (!table->parent->procname)
-				set_fail(&fail, table, "Parent without procname");
-		}
-		if (table->child) {
-			if (table->data)
-				set_fail(&fail, table, "Directory with data?");
-			if (table->maxlen)
-				set_fail(&fail, table, "Directory with maxlen?");
-			if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
-				set_fail(&fail, table, "Writable sysctl directory");
-			if (table->proc_handler)
-				set_fail(&fail, table, "Directory with proc_handler");
-			if (table->extra1)
-				set_fail(&fail, table, "Directory with extra1");
-			if (table->extra2)
-				set_fail(&fail, table, "Directory with extra2");
-		} else {
-			if ((table->proc_handler == proc_dostring) ||
-			    (table->proc_handler == proc_dointvec) ||
-			    (table->proc_handler == proc_dointvec_minmax) ||
-			    (table->proc_handler == proc_dointvec_jiffies) ||
-			    (table->proc_handler == proc_dointvec_userhz_jiffies) ||
-			    (table->proc_handler == proc_dointvec_ms_jiffies) ||
-			    (table->proc_handler == proc_doulongvec_minmax) ||
-			    (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
-				if (!table->data)
-					set_fail(&fail, table, "No data");
-				if (!table->maxlen)
-					set_fail(&fail, table, "No maxlen");
-			}
-#ifdef CONFIG_PROC_SYSCTL
-			if (!table->proc_handler)
-				set_fail(&fail, table, "No proc_handler");
-#endif
-			sysctl_check_leaf(namespaces, table, &fail);
-		}
-		if (table->mode > 0777)
-			set_fail(&fail, table, "bogus .mode");
-		if (fail) {
-			set_fail(&fail, table, NULL);
-			error = -EINVAL;
-		}
-		if (table->child)
-			error |= sysctl_check_table(namespaces, table->child);
-	}
-	return error;
-}
-- 
cgit v1.2.3-70-g09d2


From a80b83b7b8456e9b475346c2e01d7e210883208c Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Fri, 3 Feb 2012 00:19:07 -0800
Subject: Input: add infrastructure for selecting clockid for event time stamps

As noted by Arve and others, since wall time can jump backwards, it is
difficult to use for input because one cannot determine if one event
occurred before another or for how long a key was pressed.

However, the timestamp field is part of the kernel ABI, and cannot be
changed without possibly breaking existing users.

This patch adds a new IOCTL that allows a clockid to be set in the
evdev_client struct that will specify which time base to use for event
timestamps (ie: CLOCK_MONOTONIC instead of CLOCK_REALTIME).

For now we only support CLOCK_MONOTONIC and CLOCK_REALTIME, but
in the future we could support other clockids if appropriate.

The default remains CLOCK_REALTIME, so we don't change the ABI.

Signed-off-by: John Stultz <john.stultz@linaro.org>
Reviewed-by: Daniel Kurtz <djkurtz@google.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/evdev.c     | 25 +++++++++++++++++++++----
 include/linux/input.h     |  2 ++
 kernel/time/timekeeping.c |  2 ++
 3 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index 76457d50bc3..c1740974299 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -46,6 +46,7 @@ struct evdev_client {
 	struct fasync_struct *fasync;
 	struct evdev *evdev;
 	struct list_head node;
+	int clkid;
 	unsigned int bufsize;
 	struct input_event buffer[];
 };
@@ -54,8 +55,12 @@ static struct evdev *evdev_table[EVDEV_MINORS];
 static DEFINE_MUTEX(evdev_table_mutex);
 
 static void evdev_pass_event(struct evdev_client *client,
-			     struct input_event *event)
+			     struct input_event *event,
+			     ktime_t mono, ktime_t real)
 {
+	event->time = ktime_to_timeval(client->clkid == CLOCK_MONOTONIC ?
+					mono : real);
+
 	/* Interrupts are disabled, just acquire the lock. */
 	spin_lock(&client->buffer_lock);
 
@@ -94,8 +99,11 @@ static void evdev_event(struct input_handle *handle,
 	struct evdev *evdev = handle->private;
 	struct evdev_client *client;
 	struct input_event event;
+	ktime_t time_mono, time_real;
+
+	time_mono = ktime_get();
+	time_real = ktime_sub(time_mono, ktime_get_monotonic_offset());
 
-	do_gettimeofday(&event.time);
 	event.type = type;
 	event.code = code;
 	event.value = value;
@@ -103,11 +111,12 @@ static void evdev_event(struct input_handle *handle,
 	rcu_read_lock();
 
 	client = rcu_dereference(evdev->grab);
+
 	if (client)
-		evdev_pass_event(client, &event);
+		evdev_pass_event(client, &event, time_mono, time_real);
 	else
 		list_for_each_entry_rcu(client, &evdev->client_list, node)
-			evdev_pass_event(client, &event);
+			evdev_pass_event(client, &event, time_mono, time_real);
 
 	rcu_read_unlock();
 
@@ -685,6 +694,14 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd,
 		else
 			return evdev_ungrab(evdev, client);
 
+	case EVIOCSCLOCKID:
+		if (copy_from_user(&i, p, sizeof(unsigned int)))
+			return -EFAULT;
+		if (i != CLOCK_MONOTONIC && i != CLOCK_REALTIME)
+			return -EINVAL;
+		client->clkid = i;
+		return 0;
+
 	case EVIOCGKEYCODE:
 		return evdev_handle_get_keycode(dev, p);
 
diff --git a/include/linux/input.h b/include/linux/input.h
index 3862e32c4ee..177261ea6f5 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -129,6 +129,8 @@ struct input_keymap_entry {
 
 #define EVIOCGRAB		_IOW('E', 0x90, int)			/* Grab/Release device */
 
+#define EVIOCSCLOCKID		_IOW('E', 0xa0, int)			/* Set clockid to be used for timestamps */
+
 /*
  * Device properties and quirks
  */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2b021b0e850..16947999475 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1140,6 +1140,8 @@ ktime_t ktime_get_monotonic_offset(void)
 	} while (read_seqretry(&xtime_lock, seq));
 	return timespec_to_ktime(wtom);
 }
+EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
+
 
 /**
  * xtime_update() - advances the timekeeping infrastructure
-- 
cgit v1.2.3-70-g09d2


From 13ae246db4a02971ef4f557af1f6d3e21d64b710 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Sun, 29 Jan 2012 15:44:45 -0500
Subject: includecheck: delete any duplicate instances of module.h

Different tree maintainers picked up independently generated
trivial compile fixes based on linux-next testing, resulting
in some cases where a file would have got more than one addition
of module.h once everything was all merged together.

Delete any duplicates so includecheck isn't complaining about
anything related to module.h/export.h changes.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 arch/blackfin/mach-bf537/boards/pnav10.c  | 1 -
 drivers/dma/imx-dma.c                     | 1 -
 drivers/dma/imx-sdma.c                    | 1 -
 drivers/media/video/adp1653.c             | 1 -
 drivers/mmc/host/sdhci-tegra.c            | 1 -
 drivers/power/max8998_charger.c           | 1 -
 drivers/staging/iio/dac/ad5686.c          | 1 -
 drivers/staging/iio/gyro/adis16060_core.c | 1 -
 drivers/staging/sm7xx/smtcfb.c            | 1 -
 drivers/usb/dwc3/core.c                   | 1 -
 drivers/usb/dwc3/dwc3-omap.c              | 1 -
 kernel/params.c                           | 1 -
 12 files changed, 12 deletions(-)

(limited to 'kernel')

diff --git a/arch/blackfin/mach-bf537/boards/pnav10.c b/arch/blackfin/mach-bf537/boards/pnav10.c
index 6fd84709fc6..7d15a3024e4 100644
--- a/arch/blackfin/mach-bf537/boards/pnav10.c
+++ b/arch/blackfin/mach-bf537/boards/pnav10.c
@@ -101,7 +101,6 @@ static struct platform_device smc91x_device = {
 
 #if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE)
 #include <linux/bfin_mac.h>
-#include <linux/export.h>
 static const unsigned short bfin_mac_peripherals[] = P_RMII0;
 
 static struct bfin_phydev_platform_data bfin_phydev_data[] = {
diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c
index e4383ee2c9a..38586ba8da9 100644
--- a/drivers/dma/imx-dma.c
+++ b/drivers/dma/imx-dma.c
@@ -14,7 +14,6 @@
  * http://www.gnu.org/copyleft/gpl.html
  */
 #include <linux/init.h>
-#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index 8bc5acf36ee..63540d3e215 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -35,7 +35,6 @@
 #include <linux/dmaengine.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/module.h>
 
 #include <asm/irq.h>
 #include <mach/sdma.h>
diff --git a/drivers/media/video/adp1653.c b/drivers/media/video/adp1653.c
index 12eedf4d515..6e7d094fa2b 100644
--- a/drivers/media/video/adp1653.c
+++ b/drivers/media/video/adp1653.c
@@ -33,7 +33,6 @@
 #include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/i2c.h>
-#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/version.h>
 #include <media/adp1653.h>
diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c
index 78a36eba4df..cb348569454 100644
--- a/drivers/mmc/host/sdhci-tegra.c
+++ b/drivers/mmc/host/sdhci-tegra.c
@@ -23,7 +23,6 @@
 #include <linux/gpio.h>
 #include <linux/mmc/card.h>
 #include <linux/mmc/host.h>
-#include <linux/module.h>
 
 #include <asm/gpio.h>
 
diff --git a/drivers/power/max8998_charger.c b/drivers/power/max8998_charger.c
index 9b3f2bf56e7..6dc01c25559 100644
--- a/drivers/power/max8998_charger.c
+++ b/drivers/power/max8998_charger.c
@@ -19,7 +19,6 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-#include <linux/module.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/slab.h>
diff --git a/drivers/staging/iio/dac/ad5686.c b/drivers/staging/iio/dac/ad5686.c
index ce2d6193dd8..2415a6e60c7 100644
--- a/drivers/staging/iio/dac/ad5686.c
+++ b/drivers/staging/iio/dac/ad5686.c
@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 #include <linux/sysfs.h>
 #include <linux/regulator/consumer.h>
-#include <linux/module.h>
 
 #include "../iio.h"
 #include "../sysfs.h"
diff --git a/drivers/staging/iio/gyro/adis16060_core.c b/drivers/staging/iio/gyro/adis16060_core.c
index c0ca7093e0e..02cc23420b9 100644
--- a/drivers/staging/iio/gyro/adis16060_core.c
+++ b/drivers/staging/iio/gyro/adis16060_core.c
@@ -14,7 +14,6 @@
 #include <linux/spi/spi.h>
 #include <linux/slab.h>
 #include <linux/sysfs.h>
-#include <linux/module.h>
 
 #include "../iio.h"
 #include "../sysfs.h"
diff --git a/drivers/staging/sm7xx/smtcfb.c b/drivers/staging/sm7xx/smtcfb.c
index ae0035f327e..1b3e2d0c699 100644
--- a/drivers/staging/sm7xx/smtcfb.c
+++ b/drivers/staging/sm7xx/smtcfb.c
@@ -41,7 +41,6 @@
 
 #ifdef CONFIG_PM
 #include <linux/pm.h>
-#include <linux/module.h>
 #endif
 
 #include "smtcfb.h"
diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index 7c9df630dbe..eed4a5b6d39 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -51,7 +51,6 @@
 
 #include <linux/usb/ch9.h>
 #include <linux/usb/gadget.h>
-#include <linux/module.h>
 
 #include "core.h"
 #include "gadget.h"
diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c
index 3274ac8f120..92cc7b8bc09 100644
--- a/drivers/usb/dwc3/dwc3-omap.c
+++ b/drivers/usb/dwc3/dwc3-omap.c
@@ -46,7 +46,6 @@
 #include <linux/dma-mapping.h>
 #include <linux/ioport.h>
 #include <linux/io.h>
-#include <linux/module.h>
 
 #include "core.h"
 #include "io.h"
diff --git a/kernel/params.c b/kernel/params.c
index 4bc965d8a1f..47f5bf12434 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,6 @@
     along with this program; if not, write to the Free Software
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
-#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/errno.h>
-- 
cgit v1.2.3-70-g09d2


From 9fbe465efc76044dd87afe764db5464ae61aeabc Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 16 Mar 2012 13:17:13 +0100
Subject: kgdb: Respect that flush op is optional

Not all kgdb I/O drivers implement a flush operation. Adjust
gdbstub_exit accordingly.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/debug/gdbstub.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index c22d8c28ad8..5a155742ae9 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1129,5 +1129,6 @@ void gdbstub_exit(int status)
 	dbg_io_ops->write_char(hex_asc_lo(checksum));
 
 	/* make sure the output is flushed, lest the bootloader clobber it */
-	dbg_io_ops->flush();
+	if (dbg_io_ops->flush)
+		dbg_io_ops->flush();
 }
-- 
cgit v1.2.3-70-g09d2


From 2366e047840e33928803c0442176fb3991423da8 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Fri, 16 Mar 2012 14:20:41 -0500
Subject: kgdb,debug-core,gdbstub: Hook the reboot notifier for debugger detach

The gdbstub and kdb should get detached if the system is rebooting.
Calling gdbstub_exit() will set the proper debug core state and send a
message to any debugger that is connected to correctly detach.

An attached debugger will receive the exit code from
include/linux/reboot.h based on SYS_HALT, SYS_REBOOT, etc...

Reported-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/debug/debug_core.c | 17 +++++++++++++++++
 kernel/debug/gdbstub.c    |  7 +++++++
 2 files changed, 24 insertions(+)

(limited to 'kernel')

diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0d7c08784ef..3c1ad4e0354 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -41,6 +41,7 @@
 #include <linux/delay.h>
 #include <linux/sched.h>
 #include <linux/sysrq.h>
+#include <linux/reboot.h>
 #include <linux/init.h>
 #include <linux/kgdb.h>
 #include <linux/kdb.h>
@@ -784,6 +785,20 @@ void __init dbg_late_init(void)
 	kdb_init(KDB_INIT_FULL);
 }
 
+static int
+dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
+{
+	if (!dbg_kdb_mode)
+		gdbstub_exit(code);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block dbg_reboot_notifier = {
+	.notifier_call		= dbg_notify_reboot,
+	.next			= NULL,
+	.priority		= INT_MAX,
+};
+
 static void kgdb_register_callbacks(void)
 {
 	if (!kgdb_io_module_registered) {
@@ -791,6 +806,7 @@ static void kgdb_register_callbacks(void)
 		kgdb_arch_init();
 		if (!dbg_is_early)
 			kgdb_arch_late();
+		register_reboot_notifier(&dbg_reboot_notifier);
 		atomic_notifier_chain_register(&panic_notifier_list,
 					       &kgdb_panic_event_nb);
 #ifdef CONFIG_MAGIC_SYSRQ
@@ -812,6 +828,7 @@ static void kgdb_unregister_callbacks(void)
 	 */
 	if (kgdb_io_module_registered) {
 		kgdb_io_module_registered = 0;
+		unregister_reboot_notifier(&dbg_reboot_notifier);
 		atomic_notifier_chain_unregister(&panic_notifier_list,
 					       &kgdb_panic_event_nb);
 		kgdb_arch_exit();
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 5a155742ae9..ce615e06448 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1111,6 +1111,13 @@ void gdbstub_exit(int status)
 	unsigned char checksum, ch, buffer[3];
 	int loop;
 
+	if (!kgdb_connected)
+		return;
+	kgdb_connected = 0;
+
+	if (!dbg_io_ops || dbg_kdb_mode)
+		return;
+
 	buffer[0] = 'W';
 	buffer[1] = hex_asc_hi(status);
 	buffer[2] = hex_asc_lo(status);
-- 
cgit v1.2.3-70-g09d2


From 8f30d411767351656ea62c9e7612120f9b870b59 Mon Sep 17 00:00:00 2001
From: Andrei Warkentin <andrey.warkentin@gmail.com>
Date: Tue, 28 Feb 2012 06:55:05 -0600
Subject: KDB: Fix usability issues relating to the 'enter' key.

This fixes the following problems:
1) Typematic-repeat of 'enter' gives warning message
   and leaks make/break if KDB exits. Repeats
   look something like 0x1c 0x1c .... 0x9c
2) Use of 'keypad enter' gives warning message and
   leaks the ENTER break/make code out if KDB exits.
   KP ENTER repeats look someting like 0xe0 0x1c
   0xe0 0x1c ... 0xe0 0x9c.
3) Lag on the order of seconds between "break" and "make" when
   expecting the enter "break" code. Seen under virtualized
   environments such as VMware ESX.

The existing special enter handler tries to glob the enter break code,
but this fails if the other (KP) enter was used, or if there was a key
repeat. It also fails if you mashed some keys along with enter, and
you ended up with a non-enter make or non-enter break code coming
after the enter make code. So first, we modify the handler to handle
these cases. But performing these actions on every enter is annoying
since now you can't hold ENTER down to scroll <more>d messages in
KDB. Since this special behaviour is only necessary to handle the
exiting KDB ('g' + ENTER) without leaking scancodes to the OS.  This
cleanup needs to get executed anytime the kdb_main loop exits.

Tested on QEMU. Set a bp on atkbd.c to verify no scan code was leaked.

Cc: Andrei Warkentin <andreiw@vmware.com>
[jason.wessel@windriver.com: move cleanup calls to kdb_main.c]
Signed-off-by: Andrei Warkentin <andrey.warkentin@gmail.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/debug/kdb/kdb_keyboard.c | 95 +++++++++++++++++++++++++++++++----------
 kernel/debug/kdb/kdb_main.c     |  3 ++
 kernel/debug/kdb/kdb_private.h  |  7 +++
 3 files changed, 83 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 4bca634975c..118527aa60e 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -25,6 +25,7 @@
 #define KBD_STAT_MOUSE_OBF	0x20	/* Mouse output buffer full */
 
 static int kbd_exists;
+static int kbd_last_ret;
 
 /*
  * Check if the keyboard controller has a keypress for us.
@@ -90,8 +91,11 @@ int kdb_get_kbd_char(void)
 		return -1;
 	}
 
-	if ((scancode & 0x80) != 0)
+	if ((scancode & 0x80) != 0) {
+		if (scancode == 0x9c)
+			kbd_last_ret = 0;
 		return -1;
+	}
 
 	scancode &= 0x7f;
 
@@ -178,35 +182,82 @@ int kdb_get_kbd_char(void)
 		return -1;	/* ignore unprintables */
 	}
 
-	if ((scancode & 0x7f) == 0x1c) {
-		/*
-		 * enter key.  All done.  Absorb the release scancode.
-		 */
+	if (scancode == 0x1c) {
+		kbd_last_ret = 1;
+		return 13;
+	}
+
+	return keychar & 0xff;
+}
+EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
+
+/*
+ * Best effort cleanup of ENTER break codes on leaving KDB. Called on
+ * exiting KDB, when we know we processed an ENTER or KP ENTER scan
+ * code.
+ */
+void kdb_kbd_cleanup_state(void)
+{
+	int scancode, scanstatus;
+
+	/*
+	 * Nothing to clean up, since either
+	 * ENTER was never pressed, or has already
+	 * gotten cleaned up.
+	 */
+	if (!kbd_last_ret)
+		return;
+
+	kbd_last_ret = 0;
+	/*
+	 * Enter key. Need to absorb the break code here, lest it gets
+	 * leaked out if we exit KDB as the result of processing 'g'.
+	 *
+	 * This has several interesting implications:
+	 * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
+	 * + Need to handle repeat ENTER and repeat KP ENTER. Repeats
+	 *   only get a break code at the end of the repeated
+	 *   sequence. This means we can't propagate the repeated key
+	 *   press, and must swallow it away.
+	 * + Need to handle possible PS/2 mouse input.
+	 * + Need to handle mashed keys.
+	 */
+
+	while (1) {
 		while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
-			;
+			cpu_relax();
 
 		/*
-		 * Fetch the scancode
+		 * Fetch the scancode.
 		 */
 		scancode = inb(KBD_DATA_REG);
 		scanstatus = inb(KBD_STATUS_REG);
 
-		while (scanstatus & KBD_STAT_MOUSE_OBF) {
-			scancode = inb(KBD_DATA_REG);
-			scanstatus = inb(KBD_STATUS_REG);
-		}
+		/*
+		 * Skip mouse input.
+		 */
+		if (scanstatus & KBD_STAT_MOUSE_OBF)
+			continue;
 
-		if (scancode != 0x9c) {
-			/*
-			 * Wasn't an enter-release,  why not?
-			 */
-			kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
-			       scancode, scanstatus);
-		}
+		/*
+		 * If we see 0xe0, this is either a break code for KP
+		 * ENTER, or a repeat make for KP ENTER. Either way,
+		 * since the second byte is equivalent to an ENTER,
+		 * skip the 0xe0 and try again.
+		 *
+		 * If we see 0x1c, this must be a repeat ENTER or KP
+		 * ENTER (and we swallowed 0xe0 before). Try again.
+		 *
+		 * We can also see make and break codes for other keys
+		 * mashed before or after pressing ENTER. Thus, if we
+		 * see anything other than 0x9c, we have to try again.
+		 *
+		 * Note, if you held some key as ENTER was depressed,
+		 * that break code would get leaked out.
+		 */
+		if (scancode != 0x9c)
+			continue;
 
-		return 13;
+		return;
 	}
-
-	return keychar & 0xff;
 }
-EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index e2ae7349437..67b847dfa2b 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
 	if (KDB_STATE(DOING_SS))
 		KDB_STATE_CLEAR(SSBPT);
 
+	/* Clean up any keyboard devices before leaving */
+	kdb_kbd_cleanup_state();
+
 	return result;
 }
 
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index e381d105b40..47c4e56e513 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -246,6 +246,13 @@ extern void debug_kusage(void);
 
 extern void kdb_set_current_task(struct task_struct *);
 extern struct task_struct *kdb_current_task;
+
+#ifdef CONFIG_KDB_KEYBOARD
+extern void kdb_kbd_cleanup_state(void);
+#else /* ! CONFIG_KDB_KEYBOARD */
+#define kdb_kbd_cleanup_state()
+#endif /* ! CONFIG_KDB_KEYBOARD */
+
 #ifdef CONFIG_MODULES
 extern struct list_head *kdb_modules;
 #endif /* CONFIG_MODULES */
-- 
cgit v1.2.3-70-g09d2


From bec4d62ead8096e433d624d9339893f50badd992 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Mon, 19 Mar 2012 19:35:55 -0500
Subject: kgdb,debug_core: add the ability to control the reboot notifier

Sometimes it is desirable to stop the kernel debugger before allowing
a system to reboot either with kdb or kgdb.  This patch adds the
ability to turn the reboot notifier on and off or enter the debugger
and stop kernel execution before rebooting.

It is possible to change the setting after booting the kernel with the
following:

echo 1 > /sys/module/debug_core/parameters/kgdbreboot

It is also possible to change this setting using kdb / kgdb to
manipulate the variable directly.

Using KDB:
   mm kgdbreboot 1

Using gdb:
   set kgdbreboot=1

Reported-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 Documentation/DocBook/kgdb.tmpl | 17 +++++++++++++++++
 kernel/debug/debug_core.c       | 16 ++++++++++++++++
 2 files changed, 33 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl
index d71b57fcf11..4ee4ba3509f 100644
--- a/Documentation/DocBook/kgdb.tmpl
+++ b/Documentation/DocBook/kgdb.tmpl
@@ -361,6 +361,23 @@
    <para>It is possible to use this option with kgdboc on a tty that is not a system console.
    </para>
   </para>
+  </sect1>
+   <sect1 id="kgdbreboot">
+   <title>Run time parameter: kgdbreboot</title>
+   <para> The kgdbreboot feature allows you to change how the debugger
+   deals with the reboot notification.  You have 3 choices for the
+   behavior.  The default behavior is always set to 0.</para>
+   <orderedlist>
+   <listitem><para>echo -1 > /sys/module/debug_core/parameters/kgdbreboot</para>
+   <para>Ignore the reboot notification entirely.</para>
+   </listitem>
+   <listitem><para>echo 0 > /sys/module/debug_core/parameters/kgdbreboot</para>
+   <para>Send the detach message to any attached debugger client.</para>
+   </listitem>
+   <listitem><para>echo 1 > /sys/module/debug_core/parameters/kgdbreboot</para>
+   <para>Enter the debugger on reboot notify.</para>
+   </listitem>
+   </orderedlist>
   </sect1>
   </chapter>
   <chapter id="usingKDB">
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 3c1ad4e0354..3f88a45e6f0 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -76,6 +76,8 @@ static int			exception_level;
 struct kgdb_io		*dbg_io_ops;
 static DEFINE_SPINLOCK(kgdb_registration_lock);
 
+/* Action for the reboot notifiter, a global allow kdb to change it */
+static int kgdbreboot;
 /* kgdb console driver is loaded */
 static int kgdb_con_registered;
 /* determine if kgdb console output should be used */
@@ -97,6 +99,7 @@ static int __init opt_kgdb_con(char *str)
 early_param("kgdbcon", opt_kgdb_con);
 
 module_param(kgdb_use_con, int, 0644);
+module_param(kgdbreboot, int, 0644);
 
 /*
  * Holds information about breakpoints in a kernel. These breakpoints are
@@ -788,8 +791,21 @@ void __init dbg_late_init(void)
 static int
 dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
 {
+	/*
+	 * Take the following action on reboot notify depending on value:
+	 *    1 == Enter debugger
+	 *    0 == [the default] detatch debug client
+	 *   -1 == Do nothing... and use this until the board resets
+	 */
+	switch (kgdbreboot) {
+	case 1:
+		kgdb_breakpoint();
+	case -1:
+		goto done;
+	}
 	if (!dbg_kdb_mode)
 		gdbstub_exit(code);
+done:
 	return NOTIFY_DONE;
 }
 
-- 
cgit v1.2.3-70-g09d2


From b8adde8ddec9ff62a21564fa8020b5463e70d4de Mon Sep 17 00:00:00 2001
From: Tim Bird <tim.bird@am.sony.com>
Date: Wed, 21 Sep 2011 13:19:12 -0700
Subject: kdb: Avoid using dbg_io_ops until it is initialized

This fixes a bug with setting a breakpoint during kdb initialization
(from kdb_cmds).  Any call to kdb_printf() before the initialization
of the kgdboc serial console driver (which happens much later during
bootup than kdb_init), results in kernel panic due to the use of
dbg_io_ops before it is initialized.

Signed-off-by: Tim Bird <tim.bird@am.sony.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/debug/kdb/kdb_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 4802eb5840e..9b5f17da1c5 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -689,7 +689,7 @@ kdb_printit:
 	if (!dbg_kdb_mode && kgdb_connected) {
 		gdbstub_msg_write(kdb_buffer, retlen);
 	} else {
-		if (!dbg_io_ops->is_console) {
+		if (dbg_io_ops && !dbg_io_ops->is_console) {
 			len = strlen(kdb_buffer);
 			cp = kdb_buffer;
 			while (len--) {
-- 
cgit v1.2.3-70-g09d2


From 1ba0c1720eb0de2d0f3abf84c0b128d10af520d1 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Wed, 21 Sep 2011 13:07:47 -0700
Subject: kdb: Add message about CONFIG_DEBUG_RODATA on failure to install
 breakpoint

On x86, if CONFIG_DEBUG_RODATA is set, one cannot set breakpoints
via KDB.  Apparently this is a well-known problem, as at least one distribution
now ships with both KDB enabled and CONFIG_DEBUG_RODATA=y for security reasons.

This patch adds an printk message to the breakpoint failure case,
in order to provide suggestions about how to use the debugger.

Reported-by: Tim Bird <tim.bird@am.sony.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Acked-by: Tim Bird <tim.bird@am.sony.com>
---
 kernel/debug/kdb/kdb_bp.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 20059ef4459..8418c2f8ec5 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
 	} else {
 		kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
 			   __func__, bp->bp_addr);
+#ifdef CONFIG_DEBUG_RODATA
+		if (!bp->bp_type) {
+			kdb_printf("Software breakpoints are unavailable.\n"
+				   "  Change the kernel CONFIG_DEBUG_RODATA=n\n"
+				   "  OR use hw breaks: help bph\n");
+		}
+#endif
 		return 1;
 	}
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From ebec18a6d3aa1e7d84aab16225e87fd25170ec2b Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 23 Mar 2012 15:01:54 -0700
Subject: prctl: add PR_{SET,GET}_CHILD_SUBREAPER to allow simple process
 supervision

Userspace service managers/supervisors need to track their started
services.  Many services daemonize by double-forking and get implicitly
re-parented to PID 1.  The service manager will no longer be able to
receive the SIGCHLD signals for them, and is no longer in charge of
reaping the children with wait().  All information about the children is
lost at the moment PID 1 cleans up the re-parented processes.

With this prctl, a service manager process can mark itself as a sort of
'sub-init', able to stay as the parent for all orphaned processes
created by the started services.  All SIGCHLD signals will be delivered
to the service manager.

Receiving SIGCHLD and doing wait() is in cases of a service-manager much
preferred over any possible asynchronous notification about specific
PIDs, because the service manager has full access to the child process
data in /proc and the PID can not be re-used until the wait(), the
service-manager itself is in charge of, has happened.

As a side effect, the relevant parent PID information does not get lost
by a double-fork, which results in a more elaborate process tree and
'ps' output:

before:
  # ps afx
  253 ?        Ss     0:00 /bin/dbus-daemon --system --nofork
  294 ?        Sl     0:00 /usr/libexec/polkit-1/polkitd
  328 ?        S      0:00 /usr/sbin/modem-manager
  608 ?        Sl     0:00 /usr/libexec/colord
  658 ?        Sl     0:00 /usr/libexec/upowerd
  819 ?        Sl     0:00 /usr/libexec/imsettings-daemon
  916 ?        Sl     0:00 /usr/libexec/udisks-daemon
  917 ?        S      0:00  \_ udisks-daemon: not polling any devices

after:
  # ps afx
  294 ?        Ss     0:00 /bin/dbus-daemon --system --nofork
  426 ?        Sl     0:00  \_ /usr/libexec/polkit-1/polkitd
  449 ?        S      0:00  \_ /usr/sbin/modem-manager
  635 ?        Sl     0:00  \_ /usr/libexec/colord
  705 ?        Sl     0:00  \_ /usr/libexec/upowerd
  959 ?        Sl     0:00  \_ /usr/libexec/udisks-daemon
  960 ?        S      0:00  |   \_ udisks-daemon: not polling any devices
  977 ?        Sl     0:00  \_ /usr/libexec/packagekitd

This prctl is orthogonal to PID namespaces.  PID namespaces are isolated
from each other, while a service management process usually requires the
services to live in the same namespace, to be able to talk to each
other.

Users of this will be the systemd per-user instance, which provides
init-like functionality for the user's login session and D-Bus, which
activates bus services on-demand.  Both need init-like capabilities to
be able to properly keep track of the services they start.

Many thanks to Oleg for several rounds of review and insights.

[akpm@linux-foundation.org: fix comment layout and spelling]
[akpm@linux-foundation.org: add lengthy code comment from Oleg]
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Lennart Poettering <lennart@poettering.net>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Acked-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/prctl.h |  3 +++
 include/linux/sched.h | 12 ++++++++++++
 kernel/exit.c         | 33 ++++++++++++++++++++++++++++-----
 kernel/fork.c         |  3 +++
 kernel/sys.c          |  8 ++++++++
 5 files changed, 54 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a0413ac3abe..e0cfec2490a 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -121,4 +121,7 @@
 #define PR_SET_PTRACER 0x59616d61
 # define PR_SET_PTRACER_ANY ((unsigned long)-1)
 
+#define PR_SET_CHILD_SUBREAPER 36
+#define PR_GET_CHILD_SUBREAPER 37
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0c147a4260a..0c3854b0d4b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -553,6 +553,18 @@ struct signal_struct {
 	int			group_stop_count;
 	unsigned int		flags; /* see SIGNAL_* flags below */
 
+	/*
+	 * PR_SET_CHILD_SUBREAPER marks a process, like a service
+	 * manager, to re-parent orphan (double-forking) child processes
+	 * to this process instead of 'init'. The service manager is
+	 * able to receive SIGCHLD signals and is able to investigate
+	 * the process until it calls wait(). All children of this
+	 * process will inherit a flag if they should look for a
+	 * child_subreaper process at exit.
+	 */
+	unsigned int		is_child_subreaper:1;
+	unsigned int		has_child_subreaper:1;
+
 	/* POSIX.1b Interval Timers */
 	struct list_head posix_timers;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 16b07bfac22..456329fd4ea 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -687,11 +687,11 @@ static void exit_mm(struct task_struct * tsk)
 }
 
 /*
- * When we die, we re-parent all our children.
- * Try to give them to another thread in our thread
- * group, and if no such member exists, give it to
- * the child reaper process (ie "init") in our pid
- * space.
+ * When we die, we re-parent all our children, and try to:
+ * 1. give them to another thread in our thread group, if such a member exists
+ * 2. give it to the first ancestor process which prctl'd itself as a
+ *    child_subreaper for its children (like a service manager)
+ * 3. give it to the init process (PID 1) in our pid namespace
  */
 static struct task_struct *find_new_reaper(struct task_struct *father)
 	__releases(&tasklist_lock)
@@ -722,6 +722,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
 		 * forget_original_parent() must move them somewhere.
 		 */
 		pid_ns->child_reaper = init_pid_ns.child_reaper;
+	} else if (father->signal->has_child_subreaper) {
+		struct task_struct *reaper;
+
+		/*
+		 * Find the first ancestor marked as child_subreaper.
+		 * Note that the code below checks same_thread_group(reaper,
+		 * pid_ns->child_reaper).  This is what we need to DTRT in a
+		 * PID namespace. However we still need the check above, see
+		 * http://marc.info/?l=linux-kernel&m=131385460420380
+		 */
+		for (reaper = father->real_parent;
+		     reaper != &init_task;
+		     reaper = reaper->real_parent) {
+			if (same_thread_group(reaper, pid_ns->child_reaper))
+				break;
+			if (!reaper->signal->is_child_subreaper)
+				continue;
+			thread = reaper;
+			do {
+				if (!(thread->flags & PF_EXITING))
+					return reaper;
+			} while_each_thread(reaper, thread);
+		}
 	}
 
 	return pid_ns->child_reaper;
diff --git a/kernel/fork.c b/kernel/fork.c
index 37674ec55cd..b9372a0bff1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1051,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
+	sig->has_child_subreaper = current->signal->has_child_subreaper ||
+				   current->signal->is_child_subreaper;
+
 	mutex_init(&sig->cred_guard_mutex);
 
 	return 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index 888d227fd19..9eb7fcab8df 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1962,6 +1962,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		case PR_SET_MM:
 			error = prctl_set_mm(arg2, arg3, arg4, arg5);
 			break;
+		case PR_SET_CHILD_SUBREAPER:
+			me->signal->is_child_subreaper = !!arg2;
+			error = 0;
+			break;
+		case PR_GET_CHILD_SUBREAPER:
+			error = put_user(me->signal->is_child_subreaper,
+					 (int __user *) arg2);
+			break;
 		default:
 			error = -EINVAL;
 			break;
-- 
cgit v1.2.3-70-g09d2


From 397a21f24d455982a8a6f9bc11b5f3326ce3c6ef Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Fri, 23 Mar 2012 15:01:54 -0700
Subject: kernel/exit.c: if init dies, log a signal which killed it, if any

I just received another user's pleas for help when their init
mysteriously died.  I again explained that they need to check whether it
died because of bad instruction, a segv, or something else.  Which was
an annoying detour into writing a trivial C program to spawn his init
and print its exit code:

  http://lists.busybox.net/pipermail/busybox/2012-January/077172.html

I hear you saying "just test it under /bin/sh".  Well, the crashing init
_was_ /bin/sh.

Which prompted me to make kernel do this first step automatically.  We can
print exit code, which makes it possible to see that death was from e.g.
SIGILL without writing test programs.

[akpm@linux-foundation.org: add 0x to hex number output]
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 456329fd4ea..3db1909faed 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -711,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
 
 	if (unlikely(pid_ns->child_reaper == father)) {
 		write_unlock_irq(&tasklist_lock);
-		if (unlikely(pid_ns == &init_pid_ns))
-			panic("Attempted to kill init!");
+		if (unlikely(pid_ns == &init_pid_ns)) {
+			panic("Attempted to kill init! exitcode=0x%08x\n",
+				father->signal->group_exit_code ?:
+					father->exit_code);
+		}
 
 		zap_pid_ns_processes(pid_ns);
 		write_lock_irq(&tasklist_lock);
-- 
cgit v1.2.3-70-g09d2


From 7a05c0f7bbae91d08b7d0acf016fdb42dbc912ae Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Fri, 23 Mar 2012 15:01:55 -0700
Subject: watchdog: make sure the watchdog thread gets CPU on loaded system

If the system is loaded while hotplugging a CPU we might end up with a
bogus hardlockup detection.  This has been seen during LTP pounder test
executed in parallel with hotplug test.

The main problem is that enable_watchdog (called when CPU is brought up)
registers perf event which periodically checks per-cpu counter
(hrtimer_interrupts), updated from a hrtimer callback, but the hrtimer
is fired from the kernel thread.

This means that while we already do check for the hard lockup the kernel
thread might be sitting on the runqueue with zillions of tasks so there
is nobody to update the value we rely on and so we KABOOM.

Let's fix this by boosting the watchdog thread priority before we wake
it up rather than when it's already running.  This still doesn't handle
a case where we have the same amount of high prio FIFO tasks but that
doesn't seem to be common.  The current implementation doesn't handle
that case anyway so this is not worse at least.

Unfortunately, we cannot start perf counter from the watchdog thread
because we could miss a real lock up and also we cannot start the
hrtimer watchdog_enable because we there is no way (at least I don't
know any) to start a hrtimer from a different CPU.

[dzickus@redhat.com: fix compile issue with param]
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Mandeep Singh Baines <msb@chromium.org>
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/watchdog.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 14bc092fb12..203fc6e1a28 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -319,11 +319,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
  */
 static int watchdog(void *unused)
 {
-	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+	struct sched_param param = { .sched_priority = 0 };
 	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
 
-	sched_setscheduler(current, SCHED_FIFO, &param);
-
 	/* initialize timestamp */
 	__touch_watchdog();
 
@@ -350,7 +348,6 @@ static int watchdog(void *unused)
 		set_current_state(TASK_INTERRUPTIBLE);
 	}
 	__set_current_state(TASK_RUNNING);
-	param.sched_priority = 0;
 	sched_setscheduler(current, SCHED_NORMAL, &param);
 	return 0;
 }
@@ -439,6 +436,7 @@ static int watchdog_enable(int cpu)
 
 	/* create the watchdog thread */
 	if (!p) {
+		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 		p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
 		if (IS_ERR(p)) {
 			printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
@@ -450,6 +448,7 @@ static int watchdog_enable(int cpu)
 			}
 			goto out;
 		}
+		sched_setscheduler(p, SCHED_FIFO, &param);
 		kthread_bind(p, cpu);
 		per_cpu(watchdog_touch_ts, cpu) = 0;
 		per_cpu(softlockup_watchdog, cpu) = p;
-- 
cgit v1.2.3-70-g09d2


From 4501980aae221ed8120dee3491f799ecd75187ad Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 23 Mar 2012 15:01:55 -0700
Subject: kernel/watchdog.c: convert to pr_foo()

It fixes some 80-col wordwrappings and adds some consistency.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/watchdog.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 203fc6e1a28..a01cb03b045 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -9,6 +9,8 @@
  * to those contributors as well.
  */
 
+#define pr_fmt(fmt) "NMI watchdog: " fmt
+
 #include <linux/mm.h>
 #include <linux/cpu.h>
 #include <linux/nmi.h>
@@ -373,18 +375,20 @@ static int watchdog_nmi_enable(int cpu)
 	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
 	if (!IS_ERR(event)) {
-		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
+		pr_info("enabled, takes one hw-pmu counter.\n");
 		goto out_save;
 	}
 
 
 	/* vary the KERN level based on the returned errno */
 	if (PTR_ERR(event) == -EOPNOTSUPP)
-		printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
+		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
 	else if (PTR_ERR(event) == -ENOENT)
-		printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
+		pr_warning("disabled (cpu%i): hardware events not enabled\n",
+			 cpu);
 	else
-		printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
+		pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
+			cpu, PTR_ERR(event));
 	return PTR_ERR(event);
 
 	/* success path */
@@ -439,7 +443,7 @@ static int watchdog_enable(int cpu)
 		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 		p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
 		if (IS_ERR(p)) {
-			printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
+			pr_err("softlockup watchdog for %i failed\n", cpu);
 			if (!err) {
 				/* if hardlockup hasn't already set this */
 				err = PTR_ERR(p);
@@ -495,7 +499,7 @@ static void watchdog_enable_all_cpus(void)
 			watchdog_enabled = 1;
 
 	if (!watchdog_enabled)
-		printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
+		pr_err("failed to be enabled on some cpus\n");
 
 }
 
-- 
cgit v1.2.3-70-g09d2


From b60f796c4ca72545327a069f12938360d833cce7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 23 Mar 2012 15:01:56 -0700
Subject: kernel/watchdog.c: add comment to watchdog() exit path

Revelation from Peter.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@tglx.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/watchdog.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a01cb03b045..df30ee08bdd 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -349,6 +349,10 @@ static int watchdog(void *unused)
 
 		set_current_state(TASK_INTERRUPTIBLE);
 	}
+	/*
+	 * Drop the policy/priority elevation during thread exit to avoid a
+	 * scheduling latency spike.
+	 */
 	__set_current_state(TASK_RUNNING);
 	sched_setscheduler(current, SCHED_NORMAL, &param);
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 8c5cf9e5c50dc902713897e10201aa71f3546aa1 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Fri, 23 Mar 2012 15:02:40 -0700
Subject: ptrace: don't modify flags on PTRACE_SETOPTIONS failure

On ptrace(PTRACE_SETOPTIONS, pid, 0, <opts>), we used to set those
option bits which are known, and then fail with -EINVAL if there are
some unknown bits in <opts>.

This is inconsistent with typical error handling, which does not change
any state if input is invalid.

This patch changes PTRACE_SETOPTIONS behavior so that in this case, we
return -EINVAL and don't change any bits in task->ptrace.

It's very unlikely that there is userspace code in the wild which will
be affected by this change: it should have the form

    ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_BOGUSOPT)

where PTRACE_O_BOGUSOPT is a constant unknown to the kernel.  But kernel
headers, naturally, don't contain any PTRACE_O_BOGUSOPTs, thus the only
way userspace can use one if it defines one itself.  I can't see why
anyone would do such a thing deliberately.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Pedro Alves <palves@redhat.com>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 00ab2ca5ed1..273f56ea39d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -528,6 +528,9 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
 
 static int ptrace_setoptions(struct task_struct *child, unsigned long data)
 {
+	if (data & ~(unsigned long)PTRACE_O_MASK)
+		return -EINVAL;
+
 	child->ptrace &= ~PT_TRACE_MASK;
 
 	if (data & PTRACE_O_TRACESYSGOOD)
@@ -551,7 +554,7 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
 	if (data & PTRACE_O_TRACEEXIT)
 		child->ptrace |= PT_TRACE_EXIT;
 
-	return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
+	return 0;
 }
 
 static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
-- 
cgit v1.2.3-70-g09d2


From 86b6c1f301faf085de5a3f9ce16b8de6e69c729b Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Fri, 23 Mar 2012 15:02:41 -0700
Subject: ptrace: simplify PTRACE_foo constants and PTRACE_SETOPTIONS code

Exchange PT_TRACESYSGOOD and PT_PTRACE_CAP bit positions, which makes
PT_option bits contiguous and therefore makes code in
ptrace_setoptions() much simpler.

Every PTRACE_O_TRACEevent is defined to (1 << PTRACE_EVENT_event)
instead of using explicit numeric constants, to ensure we don't mess up
relationship between bit positions and event ids.

PT_EVENT_FLAG_SHIFT was not particularly useful, PT_OPT_FLAG_SHIFT with
value of PT_EVENT_FLAG_SHIFT-1 is easier to use.

PT_TRACE_MASK constant is nuked, the only its use is replaced by
(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT).

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Pedro Alves <palves@redhat.com>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h | 33 +++++++++++++++------------------
 kernel/ptrace.c        | 31 ++++++++-----------------------
 2 files changed, 23 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 6fdb196caa3..6f1260ee5be 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -54,17 +54,6 @@
 /* flags in @data for PTRACE_SEIZE */
 #define PTRACE_SEIZE_DEVEL	0x80000000 /* temp flag for development */
 
-/* options set using PTRACE_SETOPTIONS */
-#define PTRACE_O_TRACESYSGOOD	0x00000001
-#define PTRACE_O_TRACEFORK	0x00000002
-#define PTRACE_O_TRACEVFORK	0x00000004
-#define PTRACE_O_TRACECLONE	0x00000008
-#define PTRACE_O_TRACEEXEC	0x00000010
-#define PTRACE_O_TRACEVFORKDONE	0x00000020
-#define PTRACE_O_TRACEEXIT	0x00000040
-
-#define PTRACE_O_MASK		0x0000007f
-
 /* Wait extended result codes for the above trace options.  */
 #define PTRACE_EVENT_FORK	1
 #define PTRACE_EVENT_VFORK	2
@@ -74,6 +63,17 @@
 #define PTRACE_EVENT_EXIT	6
 #define PTRACE_EVENT_STOP	7
 
+/* options set using PTRACE_SETOPTIONS */
+#define PTRACE_O_TRACESYSGOOD	1
+#define PTRACE_O_TRACEFORK	(1 << PTRACE_EVENT_FORK)
+#define PTRACE_O_TRACEVFORK	(1 << PTRACE_EVENT_VFORK)
+#define PTRACE_O_TRACECLONE	(1 << PTRACE_EVENT_CLONE)
+#define PTRACE_O_TRACEEXEC	(1 << PTRACE_EVENT_EXEC)
+#define PTRACE_O_TRACEVFORKDONE	(1 << PTRACE_EVENT_VFORK_DONE)
+#define PTRACE_O_TRACEEXIT	(1 << PTRACE_EVENT_EXIT)
+
+#define PTRACE_O_MASK		0x0000007f
+
 #include <asm/ptrace.h>
 
 #ifdef __KERNEL__
@@ -88,13 +88,12 @@
 #define PT_SEIZED	0x00010000	/* SEIZE used, enable new behavior */
 #define PT_PTRACED	0x00000001
 #define PT_DTRACE	0x00000002	/* delayed trace (used on m68k, i386) */
-#define PT_TRACESYSGOOD	0x00000004
-#define PT_PTRACE_CAP	0x00000008	/* ptracer can follow suid-exec */
+#define PT_PTRACE_CAP	0x00000004	/* ptracer can follow suid-exec */
 
+#define PT_OPT_FLAG_SHIFT	3
 /* PT_TRACE_* event enable flags */
-#define PT_EVENT_FLAG_SHIFT	4
-#define PT_EVENT_FLAG(event)	(1 << (PT_EVENT_FLAG_SHIFT + (event) - 1))
-
+#define PT_EVENT_FLAG(event)	(1 << (PT_OPT_FLAG_SHIFT + (event)))
+#define PT_TRACESYSGOOD		PT_EVENT_FLAG(0)
 #define PT_TRACE_FORK		PT_EVENT_FLAG(PTRACE_EVENT_FORK)
 #define PT_TRACE_VFORK		PT_EVENT_FLAG(PTRACE_EVENT_VFORK)
 #define PT_TRACE_CLONE		PT_EVENT_FLAG(PTRACE_EVENT_CLONE)
@@ -102,8 +101,6 @@
 #define PT_TRACE_VFORK_DONE	PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE)
 #define PT_TRACE_EXIT		PT_EVENT_FLAG(PTRACE_EVENT_EXIT)
 
-#define PT_TRACE_MASK	0x000003f4
-
 /* single stepping state bits (used on ARM and PA-RISC) */
 #define PT_SINGLESTEP_BIT	31
 #define PT_SINGLESTEP		(1<<PT_SINGLESTEP_BIT)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 273f56ea39d..9acd07a6f5b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -262,7 +262,7 @@ static int ptrace_attach(struct task_struct *task, long request,
 
 	/*
 	 * Protect exec's credential calculations against our interference;
-	 * interference; SUID, SGID and LSM creds get determined differently
+	 * SUID, SGID and LSM creds get determined differently
 	 * under ptrace.
 	 */
 	retval = -ERESTARTNOINTR;
@@ -528,31 +528,16 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
 
 static int ptrace_setoptions(struct task_struct *child, unsigned long data)
 {
+	unsigned flags;
+
 	if (data & ~(unsigned long)PTRACE_O_MASK)
 		return -EINVAL;
 
-	child->ptrace &= ~PT_TRACE_MASK;
-
-	if (data & PTRACE_O_TRACESYSGOOD)
-		child->ptrace |= PT_TRACESYSGOOD;
-
-	if (data & PTRACE_O_TRACEFORK)
-		child->ptrace |= PT_TRACE_FORK;
-
-	if (data & PTRACE_O_TRACEVFORK)
-		child->ptrace |= PT_TRACE_VFORK;
-
-	if (data & PTRACE_O_TRACECLONE)
-		child->ptrace |= PT_TRACE_CLONE;
-
-	if (data & PTRACE_O_TRACEEXEC)
-		child->ptrace |= PT_TRACE_EXEC;
-
-	if (data & PTRACE_O_TRACEVFORKDONE)
-		child->ptrace |= PT_TRACE_VFORK_DONE;
-
-	if (data & PTRACE_O_TRACEEXIT)
-		child->ptrace |= PT_TRACE_EXIT;
+	/* Avoid intermediate state when all opts are cleared */
+	flags = child->ptrace;
+	flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
+	flags |= (data << PT_OPT_FLAG_SHIFT);
+	child->ptrace = flags;
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From aa9147c98f27550bd39416eca5a5844e54bced26 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Fri, 23 Mar 2012 15:02:42 -0700
Subject: ptrace: make PTRACE_SEIZE set ptrace options specified in 'data'
 parameter

This can be used to close a few corner cases in strace where we get
unwanted racy behavior after attach, but before we have a chance to set
options (the notorious post-execve SIGTRAP comes to mind), and removes
the need to track "did we set opts for this task" state in strace
internals.

While we are at it:

Make it possible to extend SEIZE in the future with more functionality
by passing non-zero 'addr' parameter.  To that end, error out if 'addr'
is non-zero.  PTRACE_ATTACH did not (and still does not) have such
check, and users (strace) do pass garbage there...  let's avoid
repeating this mistake with SEIZE.

Set all task->ptrace bits in one operation - before this change, we were
adding PT_SEIZED and PT_PTRACE_CAP with task->ptrace |= BIT ops.  This
was probably ok (not a bug), but let's be on a safer side.

Changes since v2: use (unsigned long) casts instead of (long) ones, move
PTRACE_SEIZE_DEVEL-related code to separate lines of code.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Pedro Alves <palves@redhat.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 9acd07a6f5b..4661c5bc07e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -231,6 +231,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 }
 
 static int ptrace_attach(struct task_struct *task, long request,
+			 unsigned long addr,
 			 unsigned long flags)
 {
 	bool seize = (request == PTRACE_SEIZE);
@@ -238,19 +239,29 @@ static int ptrace_attach(struct task_struct *task, long request,
 
 	/*
 	 * SEIZE will enable new ptrace behaviors which will be implemented
-	 * gradually.  SEIZE_DEVEL is used to prevent applications
+	 * gradually.  SEIZE_DEVEL bit is used to prevent applications
 	 * expecting full SEIZE behaviors trapping on kernel commits which
 	 * are still in the process of implementing them.
 	 *
 	 * Only test programs for new ptrace behaviors being implemented
 	 * should set SEIZE_DEVEL.  If unset, SEIZE will fail with -EIO.
 	 *
-	 * Once SEIZE behaviors are completely implemented, this flag and
-	 * the following test will be removed.
+	 * Once SEIZE behaviors are completely implemented, this flag
+	 * will be removed.
 	 */
 	retval = -EIO;
-	if (seize && !(flags & PTRACE_SEIZE_DEVEL))
-		goto out;
+	if (seize) {
+		if (addr != 0)
+			goto out;
+		if (!(flags & PTRACE_SEIZE_DEVEL))
+			goto out;
+		flags &= ~(unsigned long)PTRACE_SEIZE_DEVEL;
+		if (flags & ~(unsigned long)PTRACE_O_MASK)
+			goto out;
+		flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
+	} else {
+		flags = PT_PTRACED;
+	}
 
 	audit_ptrace(task);
 
@@ -282,11 +293,11 @@ static int ptrace_attach(struct task_struct *task, long request,
 	if (task->ptrace)
 		goto unlock_tasklist;
 
-	task->ptrace = PT_PTRACED;
 	if (seize)
-		task->ptrace |= PT_SEIZED;
+		flags |= PT_SEIZED;
 	if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
-		task->ptrace |= PT_PTRACE_CAP;
+		flags |= PT_PTRACE_CAP;
+	task->ptrace = flags;
 
 	__ptrace_link(task, current);
 
@@ -879,7 +890,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
 	}
 
 	if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
-		ret = ptrace_attach(child, request, data);
+		ret = ptrace_attach(child, request, addr, data);
 		/*
 		 * Some architectures need to do book-keeping after
 		 * a ptrace attach.
@@ -1022,7 +1033,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 	}
 
 	if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
-		ret = ptrace_attach(child, request, data);
+		ret = ptrace_attach(child, request, addr, data);
 		/*
 		 * Some architectures need to do book-keeping after
 		 * a ptrace attach.
-- 
cgit v1.2.3-70-g09d2


From ee00560c7dac1dbbf048446a8489550d0a5765b7 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Fri, 23 Mar 2012 15:02:43 -0700
Subject: ptrace: remove PTRACE_SEIZE_DEVEL bit

PTRACE_SEIZE code is tested and ready for production use, remove the
code which requires special bit in data argument to make PTRACE_SEIZE
work.

Strace team prepares for a new release of strace, and we would like to
ship the code which uses PTRACE_SEIZE, preferably after this change goes
into released kernel.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Pedro Alves <palves@redhat.com>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h |  5 +----
 kernel/ptrace.c        | 15 ---------------
 2 files changed, 1 insertion(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 30be18064df..407c678d2e3 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -51,9 +51,6 @@
 #define PTRACE_INTERRUPT	0x4207
 #define PTRACE_LISTEN		0x4208
 
-/* flags in @data for PTRACE_SEIZE */
-#define PTRACE_SEIZE_DEVEL	0x80000000 /* temp flag for development */
-
 /* Wait extended result codes for the above trace options.  */
 #define PTRACE_EVENT_FORK	1
 #define PTRACE_EVENT_VFORK	2
@@ -64,7 +61,7 @@
 /* Extended result codes which enabled by means other than options.  */
 #define PTRACE_EVENT_STOP	128
 
-/* options set using PTRACE_SETOPTIONS */
+/* Options set using PTRACE_SETOPTIONS or using PTRACE_SEIZE @data param */
 #define PTRACE_O_TRACESYSGOOD	1
 #define PTRACE_O_TRACEFORK	(1 << PTRACE_EVENT_FORK)
 #define PTRACE_O_TRACEVFORK	(1 << PTRACE_EVENT_VFORK)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4661c5bc07e..ee8d49b9c30 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -237,25 +237,10 @@ static int ptrace_attach(struct task_struct *task, long request,
 	bool seize = (request == PTRACE_SEIZE);
 	int retval;
 
-	/*
-	 * SEIZE will enable new ptrace behaviors which will be implemented
-	 * gradually.  SEIZE_DEVEL bit is used to prevent applications
-	 * expecting full SEIZE behaviors trapping on kernel commits which
-	 * are still in the process of implementing them.
-	 *
-	 * Only test programs for new ptrace behaviors being implemented
-	 * should set SEIZE_DEVEL.  If unset, SEIZE will fail with -EIO.
-	 *
-	 * Once SEIZE behaviors are completely implemented, this flag
-	 * will be removed.
-	 */
 	retval = -EIO;
 	if (seize) {
 		if (addr != 0)
 			goto out;
-		if (!(flags & PTRACE_SEIZE_DEVEL))
-			goto out;
-		flags &= ~(unsigned long)PTRACE_SEIZE_DEVEL;
 		if (flags & ~(unsigned long)PTRACE_O_MASK)
 			goto out;
 		flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
-- 
cgit v1.2.3-70-g09d2


From 629d362b9950166c6fac2aa8425db34d824bb043 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 23 Mar 2012 15:02:44 -0700
Subject: signal: give SEND_SIG_FORCED more power to beat SIGNAL_UNKILLABLE

force_sig_info() and friends have the special semantics for synchronous
signals, this interface should not be used if the target is not current.
And it needs the fixes, in particular the clearing of SIGNAL_UNKILLABLE
is not exactly right.

However there are callers which have to use force_ exactly because it
clears SIGNAL_UNKILLABLE and thus it can kill the CLONE_NEWPID tasks,
although this is almost always is wrong by various reasons.

With this patch SEND_SIG_FORCED ignores SIGNAL_UNKILLABLE, like we do if
the signal comes from the ancestor namespace.

This makes the naming in prepare_signal() paths insane, fixed by the
next cleanup.

Note: this only affects SIGKILL/SIGSTOP, but this is enough for
force_sig() abusers.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Anton Vorontsov <anton.vorontsov@linaro.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index e76001ccf5c..2584f5a91fb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1059,7 +1059,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	assert_spin_locked(&t->sighand->siglock);
 
 	result = TRACE_SIGNAL_IGNORED;
-	if (!prepare_signal(sig, t, from_ancestor_ns))
+	if (!prepare_signal(sig, t,
+			from_ancestor_ns || (info == SEND_SIG_FORCED)))
 		goto ret;
 
 	pending = group ? &t->signal->shared_pending : &t->pending;
-- 
cgit v1.2.3-70-g09d2


From def8cf72562e17ec8316ce0cb5697c7afd6400e3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 23 Mar 2012 15:02:45 -0700
Subject: signal: cosmetic, s/from_ancestor_ns/force/ in prepare_signal() paths

Cosmetic, rename the from_ancestor_ns argument in prepare_signal()
paths.  After the previous change it doesn't match the reality.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Anton Vorontsov <anton.vorontsov@linaro.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 2584f5a91fb..d523da02dd1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -58,21 +58,20 @@ static int sig_handler_ignored(void __user *handler, int sig)
 		(handler == SIG_DFL && sig_kernel_ignore(sig));
 }
 
-static int sig_task_ignored(struct task_struct *t, int sig,
-		int from_ancestor_ns)
+static int sig_task_ignored(struct task_struct *t, int sig, bool force)
 {
 	void __user *handler;
 
 	handler = sig_handler(t, sig);
 
 	if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
-			handler == SIG_DFL && !from_ancestor_ns)
+			handler == SIG_DFL && !force)
 		return 1;
 
 	return sig_handler_ignored(handler, sig);
 }
 
-static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
+static int sig_ignored(struct task_struct *t, int sig, bool force)
 {
 	/*
 	 * Blocked signals are never ignored, since the
@@ -82,7 +81,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
 	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
 		return 0;
 
-	if (!sig_task_ignored(t, sig, from_ancestor_ns))
+	if (!sig_task_ignored(t, sig, force))
 		return 0;
 
 	/*
@@ -855,7 +854,7 @@ static void ptrace_trap_notify(struct task_struct *t)
  * Returns true if the signal should be actually delivered, otherwise
  * it should be dropped.
  */
-static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
+static int prepare_signal(int sig, struct task_struct *p, bool force)
 {
 	struct signal_struct *signal = p->signal;
 	struct task_struct *t;
@@ -915,7 +914,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
 		}
 	}
 
-	return !sig_ignored(p, sig, from_ancestor_ns);
+	return !sig_ignored(p, sig, force);
 }
 
 /*
@@ -1602,7 +1601,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
 
 	ret = 1; /* the signal is ignored */
 	result = TRACE_SIGNAL_IGNORED;
-	if (!prepare_signal(sig, t, 0))
+	if (!prepare_signal(sig, t, false))
 		goto out;
 
 	ret = 0;
-- 
cgit v1.2.3-70-g09d2


From a02d6fd643cbd4c559113b35b31d3b04e4ec60c7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 23 Mar 2012 15:02:46 -0700
Subject: signal: zap_pid_ns_processes: s/SEND_SIG_NOINFO/SEND_SIG_FORCED/

Change zap_pid_ns_processes() to use SEND_SIG_FORCED, it looks more
clear compared to SEND_SIG_NOINFO which relies on from_ancestor_ns logic
send_signal().

It is also more efficient if we need to kill a lot of tasks because it
doesn't alloc sigqueue.

While at it, add the __fatal_signal_pending(task) check as a minor
optimization.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Anton Vorontsov <anton.vorontsov@linaro.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid_namespace.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a8968396046..17b232869a0 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -168,13 +168,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	while (nr > 0) {
 		rcu_read_lock();
 
-		/*
-		 * Any nested-container's init processes won't ignore the
-		 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
-		 */
 		task = pid_task(find_vpid(nr), PIDTYPE_PID);
-		if (task)
-			send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
+		if (task && !__fatal_signal_pending(task))
+			send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
 
 		rcu_read_unlock();
 
-- 
cgit v1.2.3-70-g09d2


From b3449922502f5a161ee2b5022a33aec8472fbf18 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 23 Mar 2012 15:02:47 -0700
Subject: usermodehelper: introduce umh_complete(sub_info)

Preparation.  Add the new trivial helper, umh_complete().  Currently it
simply does complete(sub_info->complete).

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kmod.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index a0a88543934..8ea25944ce3 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -199,6 +199,11 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
 }
 EXPORT_SYMBOL(call_usermodehelper_freeinfo);
 
+static void umh_complete(struct subprocess_info *sub_info)
+{
+	complete(sub_info->complete);
+}
+
 /* Keventd can't block, but this (a child) can. */
 static int wait_for_helper(void *data)
 {
@@ -235,7 +240,7 @@ static int wait_for_helper(void *data)
 			sub_info->retval = ret;
 	}
 
-	complete(sub_info->complete);
+	umh_complete(sub_info);
 	return 0;
 }
 
@@ -269,7 +274,7 @@ static void __call_usermodehelper(struct work_struct *work)
 	case UMH_WAIT_EXEC:
 		if (pid < 0)
 			sub_info->retval = pid;
-		complete(sub_info->complete);
+		umh_complete(sub_info);
 	}
 }
 
-- 
cgit v1.2.3-70-g09d2


From d0bd587a80960d7ba7e0c8396e154028c9045c54 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 23 Mar 2012 15:02:47 -0700
Subject: usermodehelper: implement UMH_KILLABLE

Implement UMH_KILLABLE, should be used along with UMH_WAIT_EXEC/PROC.
The caller must ensure that subprocess_info->path/etc can not go away
until call_usermodehelper_freeinfo().

call_usermodehelper_exec(UMH_KILLABLE) does
wait_for_completion_killable.  If it fails, it uses
xchg(&sub_info->complete, NULL) to serialize with umh_complete() which
does the same xhcg() to access sub_info->complete.

If call_usermodehelper_exec wins, it can safely return.  umh_complete()
should get NULL and call call_usermodehelper_freeinfo().

Otherwise we know that umh_complete() was already called, in this case
call_usermodehelper_exec() falls back to wait_for_completion() which
should succeed "very soon".

Note: UMH_NO_WAIT == -1 but it obviously should not be used with
UMH_KILLABLE.  We delay the neccessary cleanup to simplify the back
porting.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kmod.h |  2 ++
 kernel/kmod.c        | 27 +++++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 722f477c4ef..1b5985855ff 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -54,6 +54,8 @@ enum umh_wait {
 	UMH_WAIT_PROC = 1,	/* wait for the process to complete */
 };
 
+#define UMH_KILLABLE	4	/* wait for EXEC/PROC killable */
+
 struct subprocess_info {
 	struct work_struct work;
 	struct completion *complete;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8ea25944ce3..f92f917c450 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -201,7 +201,15 @@ EXPORT_SYMBOL(call_usermodehelper_freeinfo);
 
 static void umh_complete(struct subprocess_info *sub_info)
 {
-	complete(sub_info->complete);
+	struct completion *comp = xchg(&sub_info->complete, NULL);
+	/*
+	 * See call_usermodehelper_exec(). If xchg() returns NULL
+	 * we own sub_info, the UMH_KILLABLE caller has gone away.
+	 */
+	if (comp)
+		complete(comp);
+	else
+		call_usermodehelper_freeinfo(sub_info);
 }
 
 /* Keventd can't block, but this (a child) can. */
@@ -252,6 +260,9 @@ static void __call_usermodehelper(struct work_struct *work)
 	enum umh_wait wait = sub_info->wait;
 	pid_t pid;
 
+	if (wait != UMH_NO_WAIT)
+		wait &= ~UMH_KILLABLE;
+
 	/* CLONE_VFORK: wait until the usermode helper has execve'd
 	 * successfully We need the data structures to stay around
 	 * until that is done.  */
@@ -461,9 +472,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
 	queue_work(khelper_wq, &sub_info->work);
 	if (wait == UMH_NO_WAIT)	/* task has freed sub_info */
 		goto unlock;
+
+	if (wait & UMH_KILLABLE) {
+		retval = wait_for_completion_killable(&done);
+		if (!retval)
+			goto wait_done;
+
+		/* umh_complete() will see NULL and free sub_info */
+		if (xchg(&sub_info->complete, NULL))
+			goto unlock;
+		/* fallthrough, umh_complete() was already called */
+	}
+
 	wait_for_completion(&done);
+wait_done:
 	retval = sub_info->retval;
-
 out:
 	call_usermodehelper_freeinfo(sub_info);
 unlock:
-- 
cgit v1.2.3-70-g09d2


From 9d944ef32e83405a07376f112e9f02161d3e9731 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 23 Mar 2012 15:02:48 -0700
Subject: usermodehelper: kill umh_wait, renumber UMH_* constants

No functional changes.  It is not sane to use UMH_KILLABLE with enum
umh_wait, but obviously we do not want another argument in
call_usermodehelper_* helpers.  Kill this enum, use the plain int.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kmod.h        | 18 +++++++-----------
 kernel/kmod.c               |  8 ++------
 security/keys/request_key.c |  2 +-
 3 files changed, 10 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 1b5985855ff..9efeae67910 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -48,12 +48,9 @@ static inline int request_module_nowait(const char *name, ...) { return -ENOSYS;
 struct cred;
 struct file;
 
-enum umh_wait {
-	UMH_NO_WAIT = -1,	/* don't wait at all */
-	UMH_WAIT_EXEC = 0,	/* wait for the exec, but not the process */
-	UMH_WAIT_PROC = 1,	/* wait for the process to complete */
-};
-
+#define UMH_NO_WAIT	0	/* don't wait at all */
+#define UMH_WAIT_EXEC	1	/* wait for the exec, but not the process */
+#define UMH_WAIT_PROC	2	/* wait for the process to complete */
 #define UMH_KILLABLE	4	/* wait for EXEC/PROC killable */
 
 struct subprocess_info {
@@ -62,7 +59,7 @@ struct subprocess_info {
 	char *path;
 	char **argv;
 	char **envp;
-	enum umh_wait wait;
+	int wait;
 	int retval;
 	int (*init)(struct subprocess_info *info, struct cred *new);
 	void (*cleanup)(struct subprocess_info *info);
@@ -80,15 +77,14 @@ void call_usermodehelper_setfns(struct subprocess_info *info,
 		    void *data);
 
 /* Actually execute the sub-process */
-int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait);
+int call_usermodehelper_exec(struct subprocess_info *info, int wait);
 
 /* Free the subprocess_info. This is only needed if you're not going
    to call call_usermodehelper_exec */
 void call_usermodehelper_freeinfo(struct subprocess_info *info);
 
 static inline int
-call_usermodehelper_fns(char *path, char **argv, char **envp,
-			enum umh_wait wait,
+call_usermodehelper_fns(char *path, char **argv, char **envp, int wait,
 			int (*init)(struct subprocess_info *info, struct cred *new),
 			void (*cleanup)(struct subprocess_info *), void *data)
 {
@@ -106,7 +102,7 @@ call_usermodehelper_fns(char *path, char **argv, char **envp,
 }
 
 static inline int
-call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait)
+call_usermodehelper(char *path, char **argv, char **envp, int wait)
 {
 	return call_usermodehelper_fns(path, argv, envp, wait,
 				       NULL, NULL, NULL);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index f92f917c450..8341de91613 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -257,12 +257,9 @@ static void __call_usermodehelper(struct work_struct *work)
 {
 	struct subprocess_info *sub_info =
 		container_of(work, struct subprocess_info, work);
-	enum umh_wait wait = sub_info->wait;
+	int wait = sub_info->wait & ~UMH_KILLABLE;
 	pid_t pid;
 
-	if (wait != UMH_NO_WAIT)
-		wait &= ~UMH_KILLABLE;
-
 	/* CLONE_VFORK: wait until the usermode helper has execve'd
 	 * successfully We need the data structures to stay around
 	 * until that is done.  */
@@ -451,8 +448,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns);
  * asynchronously if wait is not set, and runs as a child of keventd.
  * (ie. it runs with full root capabilities).
  */
-int call_usermodehelper_exec(struct subprocess_info *sub_info,
-			     enum umh_wait wait)
+int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	int retval = 0;
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 82465328c39..cc3790315d2 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -91,7 +91,7 @@ static void umh_keys_cleanup(struct subprocess_info *info)
  * Call a usermode helper with a specific session keyring.
  */
 static int call_usermodehelper_keys(char *path, char **argv, char **envp,
-			 struct key *session_keyring, enum umh_wait wait)
+					struct key *session_keyring, int wait)
 {
 	gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
 	struct subprocess_info *info =
-- 
cgit v1.2.3-70-g09d2


From 5b9bd473e3b8a8c6c4ae99be475e6e9b27568555 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 23 Mar 2012 15:02:49 -0700
Subject: usermodehelper: ____call_usermodehelper() doesn't need do_exit()

Minor cleanup.  ____call_usermodehelper() can simply return, no need to
call do_exit() explicitely.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kmod.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8341de91613..685b246b13b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -188,7 +188,7 @@ static int ____call_usermodehelper(void *data)
 	/* Exec failed? */
 fail:
 	sub_info->retval = retval;
-	do_exit(0);
+	return 0;
 }
 
 void call_usermodehelper_freeinfo(struct subprocess_info *info)
-- 
cgit v1.2.3-70-g09d2


From 3e63a93b987685f02421e18b2aa452d20553a88b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 23 Mar 2012 15:02:49 -0700
Subject: kmod: introduce call_modprobe() helper

No functional changes.  Move the call_usermodehelper code from
__request_module() into the new simple helper, call_modprobe().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kmod.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 685b246b13b..56a29e812ff 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -60,6 +60,21 @@ static DECLARE_RWSEM(umhelper_sem);
 */
 char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
 
+static int call_modprobe(char *module_name, int wait)
+{
+	static char *envp[] = {
+		"HOME=/",
+		"TERM=linux",
+		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+		NULL
+	};
+
+	char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
+
+	return call_usermodehelper_fns(modprobe_path, argv, envp,
+					wait, NULL, NULL, NULL);
+}
+
 /**
  * __request_module - try to load a kernel module
  * @wait: wait (or not) for the operation to complete
@@ -81,11 +96,6 @@ int __request_module(bool wait, const char *fmt, ...)
 	char module_name[MODULE_NAME_LEN];
 	unsigned int max_modprobes;
 	int ret;
-	char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
-	static char *envp[] = { "HOME=/",
-				"TERM=linux",
-				"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
-				NULL };
 	static atomic_t kmod_concurrent = ATOMIC_INIT(0);
 #define MAX_KMOD_CONCURRENT 50	/* Completely arbitrary value - KAO */
 	static int kmod_loop_msg;
@@ -128,9 +138,7 @@ int __request_module(bool wait, const char *fmt, ...)
 
 	trace_module_request(module_name, wait, _RET_IP_);
 
-	ret = call_usermodehelper_fns(modprobe_path, argv, envp,
-			wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
-			NULL, NULL, NULL);
+	ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
 
 	atomic_dec(&kmod_concurrent);
 	return ret;
-- 
cgit v1.2.3-70-g09d2


From 1cc684ab75123efe7ff446eb821d44375ba8fa30 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 23 Mar 2012 15:02:50 -0700
Subject: kmod: make __request_module() killable

As Tetsuo Handa pointed out, request_module() can stress the system
while the oom-killed caller sleeps in TASK_UNINTERRUPTIBLE.

The task T uses "almost all" memory, then it does something which
triggers request_module().  Say, it can simply call sys_socket().  This
in turn needs more memory and leads to OOM.  oom-killer correctly
chooses T and kills it, but this can't help because it sleeps in
TASK_UNINTERRUPTIBLE and after that oom-killer becomes "disabled" by the
TIF_MEMDIE task T.

Make __request_module() killable.  The only necessary change is that
call_modprobe() should kmalloc argv and module_name, they can't live in
the stack if we use UMH_KILLABLE.  This memory is freed via
call_usermodehelper_freeinfo()->cleanup.

Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kmod.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 56a29e812ff..957a7aab8eb 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -60,6 +60,12 @@ static DECLARE_RWSEM(umhelper_sem);
 */
 char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
 
+static void free_modprobe_argv(struct subprocess_info *info)
+{
+	kfree(info->argv[3]); /* check call_modprobe() */
+	kfree(info->argv);
+}
+
 static int call_modprobe(char *module_name, int wait)
 {
 	static char *envp[] = {
@@ -69,10 +75,26 @@ static int call_modprobe(char *module_name, int wait)
 		NULL
 	};
 
-	char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
+	char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
+	if (!argv)
+		goto out;
+
+	module_name = kstrdup(module_name, GFP_KERNEL);
+	if (!module_name)
+		goto free_argv;
+
+	argv[0] = modprobe_path;
+	argv[1] = "-q";
+	argv[2] = "--";
+	argv[3] = module_name;	/* check free_modprobe_argv() */
+	argv[4] = NULL;
 
 	return call_usermodehelper_fns(modprobe_path, argv, envp,
-					wait, NULL, NULL, NULL);
+		wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL);
+free_argv:
+	kfree(argv);
+out:
+	return -ENOMEM;
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2