path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile              |    1
-rw-r--r--  kernel/capability.c          |    4
-rw-r--r--  kernel/cgroup.c              |  587
-rw-r--r--  kernel/cgroup_freezer.c      |   26
-rw-r--r--  kernel/compat.c              |    8
-rw-r--r--  kernel/cpuset.c              |  107
-rw-r--r--  kernel/cred.c                |    8
-rw-r--r--  kernel/events/core.c         |    8
-rw-r--r--  kernel/fork.c                |   92
-rw-r--r--  kernel/hrtimer.c             |    2
-rw-r--r--  kernel/irq/proc.c            |   55
-rw-r--r--  kernel/jump_label.c          |   18
-rw-r--r--  kernel/kmod.c                |  100
-rw-r--r--  kernel/kthread.c             |    4
-rw-r--r--  kernel/mutex.c               |   25
-rw-r--r--  kernel/ns_cgroup.c           |  118
-rw-r--r--  kernel/nsproxy.c             |   46
-rw-r--r--  kernel/pm_qos_params.c       |   70
-rw-r--r--  kernel/posix-timers.c        |   25
-rw-r--r--  kernel/power/hibernate.c     |  220
-rw-r--r--  kernel/printk.c              |   87
-rw-r--r--  kernel/profile.c             |   16
-rw-r--r--  kernel/ptrace.c              |    2
-rw-r--r--  kernel/rcutree.c             |  164
-rw-r--r--  kernel/rcutree.h             |   30
-rw-r--r--  kernel/rcutree_plugin.h      |   24
-rw-r--r--  kernel/rcutree_trace.c       |   12
-rw-r--r--  kernel/sched.c               |   94
-rw-r--r--  kernel/sched_fair.c          |    5
-rw-r--r--  kernel/sched_rt.c            |   10
-rw-r--r--  kernel/sched_stats.h         |    4
-rw-r--r--  kernel/signal.c              |    6
-rw-r--r--  kernel/sysctl.c              |    8
-rw-r--r--  kernel/trace/ftrace.c        |   31
-rw-r--r--  kernel/trace/ring_buffer.c   |   10
-rw-r--r--  kernel/trace/trace.h         |   15
-rw-r--r--  kernel/trace/trace_events.c  |    7
-rw-r--r--  kernel/trace/trace_output.c  |   27
-rw-r--r--  kernel/utsname.c             |   39
-rw-r--r--  kernel/watchdog.c            |    9
-rw-r--r--  kernel/workqueue.c           |    4
41 files changed, 1328 insertions, 800 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e9cf19155b4..2d64cfcc8b4 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -61,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
-obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/capability.c b/kernel/capability.c
index 32a80e08ff4..283c529f8b1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -22,12 +22,8 @@
*/
const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-const kernel_cap_t __cap_full_set = CAP_FULL_SET;
-const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
EXPORT_SYMBOL(__cap_empty_set);
-EXPORT_SYMBOL(__cap_full_set);
-EXPORT_SYMBOL(__cap_init_eff_set);
int file_caps_enabled = 1;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 909a35510af..2731d115d72 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,7 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
+#include <linux/flex_array.h> /* used in cgroup_attach_proc */
#include <asm/atomic.h>
@@ -1735,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
}
EXPORT_SYMBOL_GPL(cgroup_path);
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, bool guarantee)
+{
+ struct css_set *oldcg;
+ struct css_set *newcg;
+
+ /*
+ * get old css_set. we need to take task_lock and refcount it, because
+ * an exiting task can change its css_set to init_css_set and drop its
+ * old one without taking cgroup_mutex.
+ */
+ task_lock(tsk);
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+
+ /* locate or allocate a new css_set for this task. */
+ if (guarantee) {
+ /* we know the css_set we want already exists. */
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(oldcg, cgrp, template);
+ BUG_ON(!newcg);
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ } else {
+ might_sleep();
+ /* find_css_set will give us newcg already referenced. */
+ newcg = find_css_set(oldcg, cgrp);
+ if (!newcg) {
+ put_css_set(oldcg);
+ return -ENOMEM;
+ }
+ }
+ put_css_set(oldcg);
+
+ /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+ * it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ return 0;
+}
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1745,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = 0;
+ int retval;
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
/* Nothing to do if the task is already in that cgroup */
@@ -1759,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, tsk, false);
+ retval = ss->can_attach(ss, cgrp, tsk);
if (retval) {
/*
* Remember on which subsystem the can_attach()
@@ -1771,46 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
goto out;
}
}
+ if (ss->can_attach_task) {
+ retval = ss->can_attach_task(cgrp, tsk);
+ if (retval) {
+ failed_ss = ss;
+ goto out;
+ }
+ }
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
- /*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
- */
- newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
- if (!newcg) {
- retval = -ENOMEM;
- goto out;
- }
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- retval = -ESRCH;
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+ if (retval)
goto out;
- }
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
-
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list))
- list_move(&tsk->cg_list, &newcg->tasks);
- write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
+ if (ss->pre_attach)
+ ss->pre_attach(cgrp);
+ if (ss->attach_task)
+ ss->attach_task(cgrp, tsk);
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk, false);
+ ss->attach(ss, cgrp, oldcgrp, tsk);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
synchronize_rcu();
- put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1829,7 +1881,7 @@ out:
*/
break;
if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, tsk, false);
+ ss->cancel_attach(ss, cgrp, tsk);
}
}
return retval;
@@ -1860,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+ struct css_set *cg;
+ struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+ struct task_struct *tsk, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(cg, cgrp, template);
+ if (newcg)
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+
+ /* doesn't exist at all? */
+ if (!newcg)
+ return false;
+ /* see if it's already in the list */
+ list_for_each_entry(cg_entry, newcg_list, links) {
+ if (cg_entry->cg == newcg) {
+ put_css_set(newcg);
+ return true;
+ }
+ }
+
+ /* not found */
+ put_css_set(newcg);
+ return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+
+ /* ensure a new css_set will exist for this thread */
+ newcg = find_css_set(cg, cgrp);
+ if (!newcg)
+ return -ENOMEM;
+ /* add it to the list */
+ cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+ if (!cg_entry) {
+ put_css_set(newcg);
+ return -ENOMEM;
+ }
+ cg_entry->cg = newcg;
+ list_add(&cg_entry->links, newcg_list);
+ return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval, i, group_size;
+ struct cgroup_subsys *ss, *failed_ss = NULL;
+ bool cancel_failed_ss = false;
+ /* guaranteed to be initialized later, but the compiler needs this */
+ struct cgroup *oldcgrp = NULL;
+ struct css_set *oldcg;
+ struct cgroupfs_root *root = cgrp->root;
+ /* threadgroup list cursor and array */
+ struct task_struct *tsk;
+ struct flex_array *group;
+ /*
+ * we need to make sure we have css_sets for all the tasks we're
+ * going to move -before- we actually start moving them, so that in
+ * case we get an ENOMEM we can bail out before making any changes.
+ */
+ struct list_head newcg_list;
+ struct cg_list_entry *cg_entry, *temp_nobe;
+
+ /*
+ * step 0: in order to do expensive, possibly blocking operations for
+ * every thread, we cannot iterate the thread group list, since it needs
+ * rcu or tasklist locked. instead, build an array of all threads in the
+ * group - threadgroup_fork_lock prevents new threads from appearing,
+ * and if threads exit, this will just be an over-estimate.
+ */
+ group_size = get_nr_threads(leader);
+ /* flex_array supports very large thread-groups better than kmalloc. */
+ group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+ GFP_KERNEL);
+ if (!group)
+ return -ENOMEM;
+ /* pre-allocate to guarantee space while iterating in rcu read-side. */
+ retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+ if (retval)
+ goto out_free_group_list;
+
+ /* prevent changes to the threadgroup list while we take a snapshot. */
+ rcu_read_lock();
+ if (!thread_group_leader(leader)) {
+ /*
+ * a race with de_thread from another thread's exec() may strip
+ * us of our leadership, making while_each_thread unsafe to use
+ * on this task. if this happens, there is no choice but to
+ * throw this task away and try again (from cgroup_procs_write);
+ * this is "double-double-toil-and-trouble-check locking".
+ */
+ rcu_read_unlock();
+ retval = -EAGAIN;
+ goto out_free_group_list;
+ }
+ /* take a reference on each task in the group to go in the array. */
+ tsk = leader;
+ i = 0;
+ do {
+ /* as per above, nr_threads may decrease, but not increase. */
+ BUG_ON(i >= group_size);
+ get_task_struct(tsk);
+ /*
+ * saying GFP_ATOMIC has no effect here because we did prealloc
+ * earlier, but it's good form to communicate our expectations.
+ */
+ retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+ BUG_ON(retval != 0);
+ i++;
+ } while_each_thread(leader, tsk);
+ /* remember the number of threads in the array for later. */
+ group_size = i;
+ rcu_read_unlock();
+
+ /*
+ * step 1: check that we can legitimately attach to the cgroup.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, leader);
+ if (retval) {
+ failed_ss = ss;
+ goto out_cancel_attach;
+ }
+ }
+ /* a callback to be run on every thread in the threadgroup. */
+ if (ss->can_attach_task) {
+ /* run on each task in the threadgroup. */
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ retval = ss->can_attach_task(cgrp, tsk);
+ if (retval) {
+ failed_ss = ss;
+ cancel_failed_ss = true;
+ goto out_cancel_attach;
+ }
+ }
+ }
+ }
+
+ /*
+ * step 2: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
+ */
+ INIT_LIST_HEAD(&newcg_list);
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ /* nothing to do if this task is already in the cgroup */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* get old css_set pointer */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ /* ignore this task if it's going away */
+ task_unlock(tsk);
+ continue;
+ }
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+ /* see if the new one for us is already in the list? */
+ if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+ /* was already there, nothing to do. */
+ put_css_set(oldcg);
+ } else {
+ /* we don't already have it. get new one. */
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto out_list_teardown;
+ }
+ }
+
+ /*
+ * step 3: now that we're guaranteed success wrt the css_sets, proceed
+ * to move all tasks to the new cgroup, calling ss->attach_task for each
+ * one along the way. there are no failure cases after here, so this is
+ * the commit point.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->pre_attach)
+ ss->pre_attach(cgrp);
+ }
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ /* leave current thread as it is if it's already there */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* attach each task to each subsystem */
+ for_each_subsys(root, ss) {
+ if (ss->attach_task)
+ ss->attach_task(cgrp, tsk);
+ }
+ /* if the thread is PF_EXITING, it can just get skipped. */
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+ /* nothing is sensitive to fork() after this point. */
+
+ /*
+ * step 4: do expensive, non-thread-specific subsystem callbacks.
+ * TODO: if ever a subsystem needs to know the oldcgrp for each task
+ * being moved, this call will need to be reworked to communicate that.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, leader);
+ }
+
+ /*
+ * step 5: success! and cleanup
+ */
+ synchronize_rcu();
+ cgroup_wakeup_rmdir_waiter(cgrp);
+ retval = 0;
+out_list_teardown:
+ /* clean up the list of prefetched css_sets. */
+ list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+ list_del(&cg_entry->links);
+ put_css_set(cg_entry->cg);
+ kfree(cg_entry);
+ }
+out_cancel_attach:
+ /* same deal as in cgroup_attach_task */
+ if (retval) {
+ for_each_subsys(root, ss) {
+ if (ss == failed_ss) {
+ if (cancel_failed_ss && ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader);
+ break;
+ }
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader);
+ }
+ }
+ /* clean up the array of referenced threads in the group. */
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ put_task_struct(tsk);
+ }
+out_free_group_list:
+ flex_array_free(group);
+ return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
+ if (!tsk) {
rcu_read_unlock();
+ cgroup_unlock();
+ return -ESRCH;
+ }
+ if (threadgroup) {
+ /*
+ * RCU protects this access, since tsk was found in the
+ * tid map. a race with de_thread may cause group_leader
+ * to stop being the leader, but cgroup_attach_proc will
+ * detect it later.
+ */
+ tsk = tsk->group_leader;
+ } else if (tsk->flags & PF_EXITING) {
+ /* optimization for the single-task-only case */
+ rcu_read_unlock();
+ cgroup_unlock();
return -ESRCH;
}
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
+ cgroup_unlock();
return -EACCES;
}
get_task_struct(tsk);
rcu_read_unlock();
} else {
- tsk = current;
+ if (threadgroup)
+ tsk = current->group_leader;
+ else
+ tsk = current;
get_task_struct(tsk);
}
- ret = cgroup_attach_task(cgrp, tsk);
+ if (threadgroup) {
+ threadgroup_fork_write_lock(tsk);
+ ret = cgroup_attach_proc(cgrp, tsk);
+ threadgroup_fork_write_unlock(tsk);
+ } else {
+ ret = cgroup_attach_task(cgrp, tsk);
+ }
put_task_struct(tsk);
+ cgroup_unlock();
return ret;
}
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
+ return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
+ do {
+ /*
+ * attach_proc fails with -EAGAIN if threadgroup leadership
+ * changes in the middle of the operation, in which case we need
+ * to find the task_struct for the new leader and start over.
+ */
+ ret = attach_task_by_pid(cgrp, tgid, true);
+ } while (ret == -EAGAIN);
return ret;
}
@@ -3259,9 +3632,9 @@ static struct cftype files[] = {
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
@@ -4257,122 +4630,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
}
/**
- * cgroup_clone - clone the cgroup the given subsystem is attached to
- * @tsk: the task to be moved
- * @subsys: the given subsystem
- * @nodename: the name for the new cgroup
- *
- * Duplicate the current cgroup in the hierarchy that the given
- * subsystem is attached to, and move this task into the new
- * child.
- */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
- char *nodename)
-{
- struct dentry *dentry;
- int ret = 0;
- struct cgroup *parent, *child;
- struct inode *inode;
- struct css_set *cg;
- struct cgroupfs_root *root;
- struct cgroup_subsys *ss;
-
- /* We shouldn't be called by an unregistered subsystem */
- BUG_ON(!subsys->active);
-
- /* First figure out what hierarchy and cgroup we're dealing
- * with, and pin them so we can drop cgroup_mutex */
- mutex_lock(&cgroup_mutex);
- again:
- root = subsys->root;
- if (root == &rootnode) {
- mutex_unlock(&cgroup_mutex);
- return 0;
- }
-
- /* Pin the hierarchy */
- if (!atomic_inc_not_zero(&root->sb->s_active)) {
- /* We race with the final deactivate_super() */
- mutex_unlock(&cgroup_mutex);
- return 0;
- }
-
- /* Keep the cgroup alive */
- task_lock(tsk);
- parent = task_cgroup(tsk, subsys->subsys_id);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
-
- mutex_unlock(&cgroup_mutex);
-
- /* Now do the VFS work to create a cgroup */
- inode = parent->dentry->d_inode;
-
- /* Hold the parent directory mutex across this operation to
- * stop anyone else deleting the new cgroup */
- mutex_lock(&inode->i_mutex);
- dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
- if (IS_ERR(dentry)) {
- printk(KERN_INFO
- "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
- PTR_ERR(dentry));
- ret = PTR_ERR(dentry);
- goto out_release;
- }
-
- /* Create the cgroup directory, which also creates the cgroup */
- ret = vfs_mkdir(inode, dentry, 0755);
- child = __d_cgrp(dentry);
- dput(dentry);
- if (ret) {
- printk(KERN_INFO
- "Failed to create cgroup %s: %d\n", nodename,
- ret);
- goto out_release;
- }
-
- /* The cgroup now exists. Retake cgroup_mutex and check
- * that we're still in the same state that we thought we
- * were. */
- mutex_lock(&cgroup_mutex);
- if ((root != subsys->root) ||
- (parent != task_cgroup(tsk, subsys->subsys_id))) {
- /* Aargh, we raced ... */
- mutex_unlock(&inode->i_mutex);
- put_css_set(cg);
-
- deactivate_super(root->sb);
- /* The cgroup is still accessible in the VFS, but
- * we're not going to try to rmdir() it at this
- * point. */
- printk(KERN_INFO
- "Race in cgroup_clone() - leaking cgroup %s\n",
- nodename);
- goto again;
- }
-
- /* do any required auto-setup */
- for_each_subsys(root, ss) {
- if (ss->post_clone)
- ss->post_clone(ss, child);
- }
-
- /* All seems fine. Finish by moving the task into the new cgroup */
- ret = cgroup_attach_task(child, tsk);
- mutex_unlock(&cgroup_mutex);
-
- out_release:
- mutex_unlock(&inode->i_mutex);
-
- mutex_lock(&cgroup_mutex);
- put_css_set(cg);
- mutex_unlock(&cgroup_mutex);
- deactivate_super(root->sb);
- return ret;
-}
-
-/**
* cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
* @cgrp: the cgroup in question
* @task: the task in question
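
With the hunks above, the cgroup.procs file becomes writable: writing a TGID migrates every thread of that thread group in one operation, where the older tasks file moves only the single thread named. A minimal user-space sketch of driving it follows; the mount point and group name are assumptions about the local setup, not anything this patch defines, and the program needs appropriate privileges.

/*
 * Illustrative sketch only: move a whole thread group into a cgroup by
 * writing its TGID to cgroup.procs. The hierarchy path below
 * (/sys/fs/cgroup/cpu/mygroup) is hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
	const char *procs = "/sys/fs/cgroup/cpu/mygroup/cgroup.procs";
	FILE *f;
	pid_t tgid;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <tgid>\n", argv[0]);
		return 1;
	}
	tgid = (pid_t)atol(argv[1]);

	f = fopen(procs, "w");
	if (!f) {
		perror(procs);
		return 1;
	}
	/*
	 * One write per thread group; cgroup_procs_write() retries the
	 * attach internally if the group leader changes under it (-EAGAIN).
	 */
	if (fprintf(f, "%d\n", tgid) < 0 || fclose(f) == EOF) {
		perror("cgroup.procs");
		return 1;
	}
	return 0;
}
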
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7c6c3..e691818d7e4 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
*/
static int freezer_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup,
- struct task_struct *task, bool threadgroup)
+ struct task_struct *task)
{
struct freezer *freezer;
@@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
if (freezer->state != CGROUP_THAWED)
return -EBUSY;
+ return 0;
+}
+
+static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
rcu_read_lock();
- if (__cgroup_freezing_or_frozen(task)) {
+ if (__cgroup_freezing_or_frozen(tsk)) {
rcu_read_unlock();
return -EBUSY;
}
rcu_read_unlock();
-
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (__cgroup_freezing_or_frozen(c)) {
- rcu_read_unlock();
- return -EBUSY;
- }
- }
- rcu_read_unlock();
- }
-
return 0;
}
@@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
.populate = freezer_populate,
.subsys_id = freezer_subsys_id,
.can_attach = freezer_can_attach,
+ .can_attach_task = freezer_can_attach_task,
+ .pre_attach = NULL,
+ .attach_task = NULL,
.attach = NULL,
.fork = freezer_fork,
.exit = NULL,
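
The freezer callbacks above keep the old semantics: attaching into a cgroup that is not THAWED, or attaching a task that is itself freezing or frozen, fails with -EBUSY; only the per-thread half of the check moved into can_attach_task(). A rough user-space illustration, with an assumed freezer mount point and child group:

/*
 * Illustrative sketch only: freeze a cgroup, then show that attaching a
 * task to it is refused. Assumes the freezer hierarchy is mounted at
 * /sys/fs/cgroup/freezer with an existing child group "box".
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");
	int err = 0;

	if (!f)
		return -errno;
	if (fputs(val, f) == EOF)
		err = -errno;
	if (fclose(f) == EOF && !err)
		err = -errno;
	return err;
}

int main(void)
{
	char pid[16];
	int err;

	err = write_str("/sys/fs/cgroup/freezer/box/freezer.state", "FROZEN\n");
	if (err)
		fprintf(stderr, "freeze: %s\n", strerror(-err));

	/* freezer_can_attach() sees state != THAWED and returns -EBUSY. */
	snprintf(pid, sizeof(pid), "%d\n", getpid());
	err = write_str("/sys/fs/cgroup/freezer/box/tasks", pid);
	if (err == -EBUSY)
		puts("attach refused while frozen, as expected");
	return 0;
}
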
diff --git a/kernel/compat.c b/kernel/compat.c
index 9214dcd087b..fc9eb093acd 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -293,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
return compat_jiffies_to_clock_t(jiffies);
}
+#ifdef __ARCH_WANT_SYS_SIGPENDING
+
/*
* Assumption: old_sigset_t and compat_old_sigset_t are both
* types that can be passed to put_user()/get_user().
@@ -312,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
return ret;
}
+#endif
+
+#ifdef __ARCH_WANT_SYS_SIGPROCMASK
+
asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
compat_old_sigset_t __user *oset)
{
@@ -333,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
return ret;
}
+#endif
+
asmlinkage long compat_sys_setrlimit(unsigned int resource,
struct compat_rlimit __user *rlim)
{
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2bb8c2e98ff..9c9b7545c81 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
-
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct task_struct *tsk, bool threadgroup)
+ struct task_struct *tsk)
{
- int ret;
struct cpuset *cs = cgroup_cs(cont);
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
- ret = security_task_setscheduler(tsk);
- if (ret)
- return ret;
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- ret = security_task_setscheduler(c);
- if (ret) {
- rcu_read_unlock();
- return ret;
- }
- }
- rcu_read_unlock();
- }
return 0;
}
-static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
- struct cpuset *cs)
+static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
+{
+ return security_task_setscheduler(task);
+}
+
+/*
+ * Protected by cgroup_lock. The nodemasks must be stored globally because
+ * dynamically allocating them is not allowed in pre_attach, and they must
+ * persist among pre_attach, attach_task, and attach.
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_from;
+static nodemask_t cpuset_attach_nodemask_to;
+
+/* Set-up work for before attaching each task. */
+static void cpuset_pre_attach(struct cgroup *cont)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cs, cpus_attach);
+
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+}
+
+/* Per-thread attachment work. */
+static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
{
int err;
+ struct cpuset *cs = cgroup_cs(cont);
+
/*
* can_attach beforehand should guarantee that this doesn't fail.
* TODO: have a better way to handle failure here
@@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
err = set_cpus_allowed_ptr(tsk, cpus_attach);
WARN_ON_ONCE(err);
- cpuset_change_task_nodemask(tsk, to);
+ cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
cpuset_update_task_spread_flag(cs, tsk);
-
}
static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct cgroup *oldcont, struct task_struct *tsk,
- bool threadgroup)
+ struct cgroup *oldcont, struct task_struct *tsk)
{
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- static nodemask_t to; /* protected by cgroup_mutex */
- if (cs == &top_cpuset) {
- cpumask_copy(cpus_attach, cpu_possible_mask);
- } else {
- guarantee_online_cpus(cs, cpus_attach);
- }
- guarantee_online_mems(cs, &to);
-
- /* do per-task migration stuff possibly for each in the threadgroup */
- cpuset_attach_task(tsk, &to, cs);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- cpuset_attach_task(c, &to, cs);
- }
- rcu_read_unlock();
- }
-
- /* change mm; only needs to be done once even if threadgroup */
- to = cs->mems_allowed;
+ /*
+ * Change mm, possibly for multiple threads in a threadgroup. This is
+ * expensive and may sleep.
+ */
+ cpuset_attach_nodemask_from = oldcs->mems_allowed;
+ cpuset_attach_nodemask_to = cs->mems_allowed;
mm = get_task_mm(tsk);
if (mm) {
- mpol_rebind_mm(mm, &to);
+ mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
+ cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+ &cpuset_attach_nodemask_to);
mmput(mm);
}
}
@@ -1809,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
}
/*
- * post_clone() is called at the end of cgroup_clone().
- * 'cgroup' was just created automatically as a result of
- * a cgroup_clone(), and the current task is about to
- * be moved into 'cgroup'.
+ * post_clone() is called during cgroup_create() when the
+ * clone_children mount argument was specified. The cgroup
+ * can not yet have any tasks.
*
* Currently we refuse to set up the cgroup - thereby
* refusing the task to be entered, and as a result refusing
@@ -1911,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
.create = cpuset_create,
.destroy = cpuset_destroy,
.can_attach = cpuset_can_attach,
+ .can_attach_task = cpuset_can_attach_task,
+ .pre_attach = cpuset_pre_attach,
+ .attach_task = cpuset_attach_task,
.attach = cpuset_attach,
.populate = cpuset_populate,
.post_clone = cpuset_post_clone,
@@ -2195,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
rcu_read_lock();
cs = task_cs(tsk);
if (cs)
- cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+ do_set_cpus_allowed(tsk, cs->cpus_allowed);
rcu_read_unlock();
/*
@@ -2222,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* Like above we can temporary set any mask and rely on
* set_cpus_allowed_ptr() as synchronization point.
*/
- cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+ do_set_cpus_allowed(tsk, cpu_possible_mask);
cpu = cpumask_any(cpu_active_mask);
}
diff --git a/kernel/cred.c b/kernel/cred.c
index 8093c16b84b..174fa84eca3 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
-/* Task credentials management - see Documentation/credentials.txt
+/* Task credentials management - see Documentation/security/credentials.txt
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -49,10 +49,10 @@ struct cred init_cred = {
.magic = CRED_MAGIC,
#endif
.securebits = SECUREBITS_DEFAULT,
- .cap_inheritable = CAP_INIT_INH_SET,
+ .cap_inheritable = CAP_EMPTY_SET,
.cap_permitted = CAP_FULL_SET,
- .cap_effective = CAP_INIT_EFF_SET,
- .cap_bset = CAP_INIT_BSET,
+ .cap_effective = CAP_FULL_SET,
+ .cap_bset = CAP_FULL_SET,
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c09767f7db3..d863b3c057b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5028,6 +5028,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
else
perf_event_output(event, nmi, data, regs);
+ if (event->fasync && event->pending_kill) {
+ if (nmi) {
+ event->pending_wakeup = 1;
+ irq_work_queue(&event->pending);
+ } else
+ perf_event_wakeup(event);
+ }
+
return ret;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 2b44d82b823..0276c30401a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -59,7 +59,6 @@
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
-#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
@@ -383,15 +382,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
if (tmp->vm_flags & VM_SHARED)
mapping->i_mmap_writable++;
- tmp->vm_truncate_count = mpnt->vm_truncate_count;
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
vma_prio_tree_add(tmp, mpnt);
flush_dcache_mmap_unlock(mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
/*
@@ -522,11 +520,12 @@ struct mm_struct * mm_alloc(void)
struct mm_struct * mm;
mm = allocate_mm();
- if (mm) {
- memset(mm, 0, sizeof(*mm));
- mm = mm_init(mm, current);
- }
- return mm;
+ if (!mm)
+ return NULL;
+
+ memset(mm, 0, sizeof(*mm));
+ mm_init_cpumask(mm);
+ return mm_init(mm, current);
}
/*
@@ -573,6 +572,57 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
+/*
+ * We added or removed a vma mapping the executable. The vmas are only mapped
+ * during exec and are not mapped with the mmap system call.
+ * Callers must hold down_write() on the mm's mmap_sem for these
+ */
+void added_exe_file_vma(struct mm_struct *mm)
+{
+ mm->num_exe_file_vmas++;
+}
+
+void removed_exe_file_vma(struct mm_struct *mm)
+{
+ mm->num_exe_file_vmas--;
+ if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
+ fput(mm->exe_file);
+ mm->exe_file = NULL;
+ }
+
+}
+
+void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+ if (new_exe_file)
+ get_file(new_exe_file);
+ if (mm->exe_file)
+ fput(mm->exe_file);
+ mm->exe_file = new_exe_file;
+ mm->num_exe_file_vmas = 0;
+}
+
+struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+ struct file *exe_file;
+
+ /* We need mmap_sem to protect against races with removal of
+ * VM_EXECUTABLE vmas */
+ down_read(&mm->mmap_sem);
+ exe_file = mm->exe_file;
+ if (exe_file)
+ get_file(exe_file);
+ up_read(&mm->mmap_sem);
+ return exe_file;
+}
+
+static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+ /* It's safe to write the exe_file pointer without exe_file_lock because
+ * this is called during fork when the task is not yet in /proc */
+ newmm->exe_file = get_mm_exe_file(oldmm);
+}
+
/**
* get_task_mm - acquire a reference to the task's mm
*
@@ -679,6 +729,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm));
+ mm_init_cpumask(mm);
/* Initializing for Swap token stuff */
mm->token_priority = 0;
@@ -927,6 +978,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->threadgroup_fork_lock);
+#endif
+
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1108,6 +1163,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_lock(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1193,12 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (clone_flags & CLONE_THREAD)
p->tgid = current->tgid;
- if (current->nsproxy != p->nsproxy) {
- retval = ns_cgroup_clone(p, pid);
- if (retval)
- goto bad_fork_free_pid;
- }
-
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
@@ -1312,6 +1363,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
perf_event_fork(p);
return p;
@@ -1350,6 +1403,8 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
@@ -1507,6 +1562,13 @@ void __init proc_caches_init(void)
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ /*
+ * FIXME! The "sizeof(struct mm_struct)" currently includes the
+ * whole struct cpumask for the OFFSTACK case. We could change
+ * this to *only* allocate as much of it as required by the
+ * maximum number of CPU's we can ever have. The cpumask_allocation
+ * is at the end of the structure, exactly for that reason.
+ */
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c541ee527ec..a9205e32a05 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -748,7 +748,7 @@ static inline void retrigger_next_event(void *arg) { }
*/
void clock_was_set(void)
{
-#ifdef CONFIG_HIGHRES_TIMERS
+#ifdef CONFIG_HIGH_RES_TIMERS
/* Retrigger the CPU local events everywhere */
on_each_cpu(retrigger_next_event, NULL, 1);
#endif
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 834899f2500..4bd4faa6323 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir;
#ifdef CONFIG_SMP
-static int irq_affinity_proc_show(struct seq_file *m, void *v)
+static int show_irq_affinity(int type, struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
const struct cpumask *mask = desc->irq_data.affinity;
@@ -28,7 +28,10 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
if (irqd_is_setaffinity_pending(&desc->irq_data))
mask = desc->pending_mask;
#endif
- seq_cpumask(m, mask);
+ if (type)
+ seq_cpumask_list(m, mask);
+ else
+ seq_cpumask(m, mask);
seq_putc(m, '\n');
return 0;
}
@@ -59,7 +62,18 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
#endif
int no_irq_affinity;
-static ssize_t irq_affinity_proc_write(struct file *file,
+static int irq_affinity_proc_show(struct seq_file *m, void *v)
+{
+ return show_irq_affinity(0, m, v);
+}
+
+static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
+{
+ return show_irq_affinity(1, m, v);
+}
+
+
+static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
@@ -72,7 +86,10 @@ static ssize_t irq_affinity_proc_write(struct file *file,
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
- err = cpumask_parse_user(buffer, count, new_value);
+ if (type)
+ err = cpumask_parselist_user(buffer, count, new_value);
+ else
+ err = cpumask_parse_user(buffer, count, new_value);
if (err)
goto free_cpumask;
@@ -100,11 +117,28 @@ free_cpumask:
return err;
}
+static ssize_t irq_affinity_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ return write_irq_affinity(0, file, buffer, count, pos);
+}
+
+static ssize_t irq_affinity_list_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ return write_irq_affinity(1, file, buffer, count, pos);
+}
+
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
}
+static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data);
+}
+
static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
@@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = {
.release = single_release,
};
+static const struct file_operations irq_affinity_list_proc_fops = {
+ .open = irq_affinity_list_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = irq_affinity_list_proc_write,
+};
+
static int default_affinity_show(struct seq_file *m, void *v)
{
seq_cpumask(m, irq_default_affinity);
@@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
proc_create_data("affinity_hint", 0400, desc->dir,
&irq_affinity_hint_proc_fops, (void *)(long)irq);
+ /* create /proc/irq/<irq>/smp_affinity_list */
+ proc_create_data("smp_affinity_list", 0600, desc->dir,
+ &irq_affinity_list_proc_fops, (void *)(long)irq);
+
proc_create_data("node", 0444, desc->dir,
&irq_node_proc_fops, (void *)(long)irq);
#endif
@@ -306,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
#ifdef CONFIG_SMP
remove_proc_entry("smp_affinity", desc->dir);
remove_proc_entry("affinity_hint", desc->dir);
+ remove_proc_entry("smp_affinity_list", desc->dir);
remove_proc_entry("node", desc->dir);
#endif
remove_proc_entry("spurious", desc->dir);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 74d1c099fbd..fa27e750dbc 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -105,9 +105,12 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
}
static void __jump_label_update(struct jump_label_key *key,
- struct jump_entry *entry, int enable)
+ struct jump_entry *entry,
+ struct jump_entry *stop, int enable)
{
- for (; entry->key == (jump_label_t)(unsigned long)key; entry++) {
+ for (; (entry < stop) &&
+ (entry->key == (jump_label_t)(unsigned long)key);
+ entry++) {
/*
* entry->code set to 0 invalidates module init text sections
* kernel_text_address() verifies we are not in core kernel
@@ -181,7 +184,11 @@ static void __jump_label_mod_update(struct jump_label_key *key, int enable)
struct jump_label_mod *mod = key->next;
while (mod) {
- __jump_label_update(key, mod->entries, enable);
+ struct module *m = mod->mod;
+
+ __jump_label_update(key, mod->entries,
+ m->jump_entries + m->num_jump_entries,
+ enable);
mod = mod->next;
}
}
@@ -245,7 +252,8 @@ static int jump_label_add_module(struct module *mod)
key->next = jlm;
if (jump_label_enabled(key))
- __jump_label_update(key, iter, JUMP_LABEL_ENABLE);
+ __jump_label_update(key, iter, iter_stop,
+ JUMP_LABEL_ENABLE);
}
return 0;
@@ -371,7 +379,7 @@ static void jump_label_update(struct jump_label_key *key, int enable)
/* if there are no users, entry can be NULL */
if (entry)
- __jump_label_update(key, entry, enable);
+ __jump_label_update(key, entry, __stop___jump_table, enable);
#ifdef CONFIG_MODULES
__jump_label_mod_update(key, enable);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5ae0ff38425..ad6a81c58b4 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,6 +25,7 @@
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/completion.h>
+#include <linux/cred.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/workqueue.h>
@@ -43,6 +44,13 @@ extern int max_threads;
static struct workqueue_struct *khelper_wq;
+#define CAP_BSET (void *)1
+#define CAP_PI (void *)2
+
+static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
+static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
+static DEFINE_SPINLOCK(umh_sysctl_lock);
+
#ifdef CONFIG_MODULES
/*
@@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module);
static int ____call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
+ struct cred *new;
int retval;
spin_lock_irq(&current->sighand->siglock);
@@ -153,6 +162,19 @@ static int ____call_usermodehelper(void *data)
goto fail;
}
+ retval = -ENOMEM;
+ new = prepare_kernel_cred(current);
+ if (!new)
+ goto fail;
+
+ spin_lock(&umh_sysctl_lock);
+ new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
+ new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
+ new->cap_inheritable);
+ spin_unlock(&umh_sysctl_lock);
+
+ commit_creds(new);
+
retval = kernel_execve(sub_info->path,
(const char *const *)sub_info->argv,
(const char *const *)sub_info->envp);
@@ -420,6 +442,84 @@ unlock:
}
EXPORT_SYMBOL(call_usermodehelper_exec);
+static int proc_cap_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
+ kernel_cap_t new_cap;
+ int err, i;
+
+ if (write && (!capable(CAP_SETPCAP) ||
+ !capable(CAP_SYS_MODULE)))
+ return -EPERM;
+
+ /*
+ * convert from the global kernel_cap_t to the ulong array to print to
+ * userspace if this is a read.
+ */
+ spin_lock(&umh_sysctl_lock);
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
+ if (table->data == CAP_BSET)
+ cap_array[i] = usermodehelper_bset.cap[i];
+ else if (table->data == CAP_PI)
+ cap_array[i] = usermodehelper_inheritable.cap[i];
+ else
+ BUG();
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ t = *table;
+ t.data = &cap_array;
+
+ /*
+ * actually read or write and array of ulongs from userspace. Remember
+ * these are least significant 32 bits first
+ */
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ /*
+ * convert from the sysctl array of ulongs to the kernel_cap_t
+ * internal representation
+ */
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
+ new_cap.cap[i] = cap_array[i];
+
+ /*
+ * Drop everything not in the new_cap (but don't add things)
+ */
+ spin_lock(&umh_sysctl_lock);
+ if (write) {
+ if (table->data == CAP_BSET)
+ usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
+ if (table->data == CAP_PI)
+ usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ return 0;
+}
+
+struct ctl_table usermodehelper_table[] = {
+ {
+ .procname = "bset",
+ .data = CAP_BSET,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ {
+ .procname = "inheritable",
+ .data = CAP_PI,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ { }
+};
+
void __init usermodehelper_init(void)
{
khelper_wq = create_singlethread_workqueue("khelper");
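
The two new entries are plain proc_doulongvec_minmax-backed sysctls, so user space sees each capability set as _KERNEL_CAPABILITY_U32S unsigned longs, least significant 32 bits first, and writes can only clear bits because of the cap_intersect() in the handler. The registration of usermodehelper_table is not in this hunk (it lands in kernel/sysctl.c), so the /proc path in the sketch below is an assumption:

/*
 * Illustrative sketch only: read the usermode-helper bounding set and
 * drop one capability from it. Assumes the table is registered as
 * /proc/sys/kernel/usermodehelper/ (done in the sysctl.c part of the
 * series). Needs root with CAP_SETPCAP and CAP_SYS_MODULE.
 */
#include <stdio.h>

#define CAP_SYS_BOOT_BIT 22	/* bit index of CAP_SYS_BOOT */

int main(void)
{
	const char *path = "/proc/sys/kernel/usermodehelper/bset";
	unsigned long lo = 0, hi = 0;
	FILE *f;

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	/* Two words: capability bits 0-31, then bits 32-63. */
	if (fscanf(f, "%lu %lu", &lo, &hi) < 1)
		fprintf(stderr, "unexpected format\n");
	fclose(f);
	printf("helper bset: 0x%lx 0x%lx\n", lo, hi);

	/* Writes are intersected with the current value, so bits can only
	 * ever be dropped. Example: forbid CAP_SYS_BOOT for helpers. */
	lo &= ~(1UL << CAP_SYS_BOOT_BIT);
	f = fopen(path, "w");
	if (f) {
		fprintf(f, "%lu %lu\n", lo, hi);
		fclose(f);
	}
	return 0;
}
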
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3b34d2732bc..4ba7cccb499 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -202,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
return;
}
- p->cpus_allowed = cpumask_of_cpu(cpu);
- p->rt.nr_cpus_allowed = 1;
+ /* It's safe because the task is inactive. */
+ do_set_cpus_allowed(p, cpumask_of(cpu));
p->flags |= PF_THREAD_BOUND;
}
EXPORT_SYMBOL(kthread_bind);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 2c938e2337c..d607ed5dd44 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock);
*/
static inline int __sched
__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
- unsigned long ip)
+ struct lockdep_map *nest_lock, unsigned long ip)
{
struct task_struct *task = current;
struct mutex_waiter waiter;
unsigned long flags;
preempt_disable();
- mutex_acquire(&lock->dep_map, subclass, 0, ip);
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
@@ -269,16 +269,25 @@ void __sched
mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_);
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_nested);
+void __sched
+_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
+{
+ might_sleep();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
+}
+
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
int __sched
mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
- return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
@@ -287,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
- subclass, _RET_IP_);
+ subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -393,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_);
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
@@ -401,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
@@ -409,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
}
#endif
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
deleted file mode 100644
index 2c98ad94ba0..00000000000
--- a/kernel/ns_cgroup.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * ns_cgroup.c - namespace cgroup subsystem
- *
- * Copyright 2006, 2007 IBM Corp
- */
-
-#include <linux/module.h>
-#include <linux/cgroup.h>
-#include <linux/fs.h>
-#include <linux/proc_fs.h>
-#include <linux/slab.h>
-#include <linux/nsproxy.h>
-
-struct ns_cgroup {
- struct cgroup_subsys_state css;
-};
-
-struct cgroup_subsys ns_subsys;
-
-static inline struct ns_cgroup *cgroup_to_ns(
- struct cgroup *cgroup)
-{
- return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
- struct ns_cgroup, css);
-}
-
-int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
-{
- char name[PROC_NUMBUF];
-
- snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
- return cgroup_clone(task, &ns_subsys, name);
-}
-
-/*
- * Rules:
- * 1. you can only enter a cgroup which is a descendant of your current
- * cgroup
- * 2. you can only place another process into a cgroup if
- * a. you have CAP_SYS_ADMIN
- * b. your cgroup is an ancestor of task's destination cgroup
- * (hence either you are in the same cgroup as task, or in an
- * ancestor cgroup thereof)
- */
-static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
- struct task_struct *task, bool threadgroup)
-{
- if (current != task) {
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!cgroup_is_descendant(new_cgroup, current))
- return -EPERM;
- }
-
- if (!cgroup_is_descendant(new_cgroup, task))
- return -EPERM;
-
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (!cgroup_is_descendant(new_cgroup, c)) {
- rcu_read_unlock();
- return -EPERM;
- }
- }
- rcu_read_unlock();
- }
-
- return 0;
-}
-
-/*
- * Rules: you can only create a cgroup if
- * 1. you are capable(CAP_SYS_ADMIN)
- * 2. the target cgroup is a descendant of your own cgroup
- */
-static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
-{
- struct ns_cgroup *ns_cgroup;
-
- if (!capable(CAP_SYS_ADMIN))
- return ERR_PTR(-EPERM);
- if (!cgroup_is_descendant(cgroup, current))
- return ERR_PTR(-EPERM);
- if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
- printk("ns_cgroup can't be created with parent "
- "'clone_children' set.\n");
- return ERR_PTR(-EINVAL);
- }
-
- printk_once("ns_cgroup deprecated: consider using the "
- "'clone_children' flag without the ns_cgroup.\n");
-
- ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
- if (!ns_cgroup)
- return ERR_PTR(-ENOMEM);
- return &ns_cgroup->css;
-}
-
-static void ns_destroy(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
-{
- struct ns_cgroup *ns_cgroup;
-
- ns_cgroup = cgroup_to_ns(cgroup);
- kfree(ns_cgroup);
-}
-
-struct cgroup_subsys ns_subsys = {
- .name = "ns",
- .can_attach = ns_can_attach,
- .create = ns_create,
- .destroy = ns_destroy,
- .subsys_id = ns_subsys_id,
-};
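
The removal notice in ns_create() pointed users at the clone_children flag, which survives this series: with it set on a cgroup, newly created children get the parent's configuration copied via post_clone() at mkdir time, which is what ns_cgroup users depended on. Roughly, from user space (mount point assumed):

/*
 * Illustrative sketch only: the suggested replacement for ns_cgroup.
 * Enable clone_children on a cpuset cgroup so that child groups created
 * afterwards inherit the parent's cpus/mems via post_clone().
 */
#include <stdio.h>
#include <errno.h>
#include <sys/stat.h>

int main(void)
{
	const char *parent = "/sys/fs/cgroup/cpuset";	/* assumed mount */
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "%s/cgroup.clone_children", parent);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);
	fclose(f);

	/* Children made from now on start with the parent's settings. */
	snprintf(path, sizeof(path), "%s/child", parent);
	if (mkdir(path, 0755) && errno != EEXIST)
		perror(path);
	return 0;
}
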
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a05d191ffdd..d6a00f3de15 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <linux/syscalls.h>
static struct kmem_cache *nsproxy_cachep;
@@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
goto out;
}
- err = ns_cgroup_clone(current, task_pid(current));
- if (err)
- put_nsproxy(*new_nsp);
-
out:
return err;
}
@@ -233,6 +232,45 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
+SYSCALL_DEFINE2(setns, int, fd, int, nstype)
+{
+ const struct proc_ns_operations *ops;
+ struct task_struct *tsk = current;
+ struct nsproxy *new_nsproxy;
+ struct proc_inode *ei;
+ struct file *file;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ file = proc_ns_fget(fd);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ err = -EINVAL;
+ ei = PROC_I(file->f_dentry->d_inode);
+ ops = ei->ns_ops;
+ if (nstype && (ops->type != nstype))
+ goto out;
+
+ new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
+ if (IS_ERR(new_nsproxy)) {
+ err = PTR_ERR(new_nsproxy);
+ goto out;
+ }
+
+ err = ops->install(new_nsproxy, ei->ns);
+ if (err) {
+ free_nsproxy(new_nsproxy);
+ goto out;
+ }
+ switch_task_namespaces(tsk, new_nsproxy);
+out:
+ fput(file);
+ return err;
+}
+
static int __init nsproxy_cache_init(void)
{
nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
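
setns() is the user-visible half of this hunk: it attaches the calling task to an existing namespace identified by a /proc/<pid>/ns/* file descriptor. There is no libc wrapper at this point, so a sketch has to go through syscall(2); the syscall number below is the x86_64 value from this series and the target PID is supplied by the caller.

/*
 * Illustrative sketch only: join another process's network namespace
 * with the new setns() syscall. Needs CAP_SYS_ADMIN; __NR_setns is
 * architecture-specific (308 on x86_64 in this series).
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_setns
#define __NR_setns 308
#endif

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/ns/net", argv[1]);

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror(path);
		return 1;
	}
	/*
	 * nstype 0 means "whatever namespace this fd refers to"; passing
	 * CLONE_NEWNET instead makes the kernel verify the type (the
	 * ops->type check in the syscall above).
	 */
	if (syscall(__NR_setns, fd, 0) < 0) {
		perror("setns");
		close(fd);
		return 1;
	}
	close(fd);
	/* Network-scoped operations now act in the target namespace. */
	return 0;
}
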
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index beb184689af..6824ca7d4d0 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -40,6 +40,7 @@
#include <linux/string.h>
#include <linux/platform_device.h>
#include <linux/init.h>
+#include <linux/kernel.h>
#include <linux/uaccess.h>
@@ -53,11 +54,17 @@ enum pm_qos_type {
PM_QOS_MIN /* return the smallest value */
};
+/*
+ * Note: The lockless read path depends on the CPU accessing
+ * target_value atomically. Atomic access is only guaranteed on all CPU
+ * types linux supports for 32 bit quantites
+ */
struct pm_qos_object {
struct plist_head requests;
struct blocking_notifier_head *notifiers;
struct miscdevice pm_qos_power_miscdev;
char *name;
+ s32 target_value; /* Do not change to 64 bit */
s32 default_value;
enum pm_qos_type type;
};
@@ -70,7 +77,8 @@ static struct pm_qos_object cpu_dma_pm_qos = {
.requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
.notifiers = &cpu_dma_lat_notifier,
.name = "cpu_dma_latency",
- .default_value = 2000 * USEC_PER_SEC,
+ .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
+ .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
.type = PM_QOS_MIN,
};
@@ -79,7 +87,8 @@ static struct pm_qos_object network_lat_pm_qos = {
.requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
.notifiers = &network_lat_notifier,
.name = "network_latency",
- .default_value = 2000 * USEC_PER_SEC,
+ .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
+ .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
.type = PM_QOS_MIN
};
@@ -89,7 +98,8 @@ static struct pm_qos_object network_throughput_pm_qos = {
.requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
.notifiers = &network_throughput_notifier,
.name = "network_throughput",
- .default_value = 0,
+ .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
+ .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
.type = PM_QOS_MAX,
};
@@ -135,6 +145,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
}
}
+static inline s32 pm_qos_read_value(struct pm_qos_object *o)
+{
+ return o->target_value;
+}
+
+static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
+{
+ o->target_value = value;
+}
+
static void update_target(struct pm_qos_object *o, struct plist_node *node,
int del, int value)
{
@@ -159,6 +179,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
plist_add(node, &o->requests);
}
curr_value = pm_qos_get_value(o);
+ pm_qos_set_value(o, curr_value);
spin_unlock_irqrestore(&pm_qos_lock, flags);
if (prev_value != curr_value)
@@ -193,18 +214,11 @@ static int find_pm_qos_object_by_minor(int minor)
* pm_qos_request - returns current system wide qos expectation
* @pm_qos_class: identification of which qos value is requested
*
- * This function returns the current target value in an atomic manner.
+ * This function returns the current target value.
*/
int pm_qos_request(int pm_qos_class)
{
- unsigned long flags;
- int value;
-
- spin_lock_irqsave(&pm_qos_lock, flags);
- value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
- spin_unlock_irqrestore(&pm_qos_lock, flags);
-
- return value;
+ return pm_qos_read_value(pm_qos_array[pm_qos_class]);
}
EXPORT_SYMBOL_GPL(pm_qos_request);
@@ -404,24 +418,36 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
size_t count, loff_t *f_pos)
{
s32 value;
- int x;
- char ascii_value[11];
struct pm_qos_request_list *pm_qos_req;
if (count == sizeof(s32)) {
if (copy_from_user(&value, buf, sizeof(s32)))
return -EFAULT;
- } else if (count == 11) { /* len('0x12345678/0') */
- if (copy_from_user(ascii_value, buf, 11))
+ } else if (count <= 11) { /* ASCII perhaps? */
+ char ascii_value[11];
+ unsigned long int ulval;
+ int ret;
+
+ if (copy_from_user(ascii_value, buf, count))
return -EFAULT;
- if (strlen(ascii_value) != 10)
- return -EINVAL;
- x = sscanf(ascii_value, "%x", &value);
- if (x != 1)
+
+ if (count > 10) {
+ if (ascii_value[10] == '\n')
+ ascii_value[10] = '\0';
+ else
+ return -EINVAL;
+ } else {
+ ascii_value[count] = '\0';
+ }
+ ret = strict_strtoul(ascii_value, 16, &ulval);
+ if (ret) {
+ pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
return -EINVAL;
- pr_debug("%s, %d, 0x%x\n", ascii_value, x, value);
- } else
+ }
+ value = (s32)lower_32_bits(ulval);
+ } else {
return -EINVAL;
+ }
pm_qos_req = filp->private_data;
pm_qos_update_request(pm_qos_req, value);
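The rewritten write path accepts either a raw binary s32 or an ASCII hex string of at most ten digits plus an optional trailing newline, parsed with strict_strtoul() instead of sscanf(). A stand-alone sketch of the same parsing logic, using strtoul() and a hypothetical helper name:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_qos_request(const char *buf, size_t count, int32_t *value)
{
	if (count == sizeof(int32_t)) {			/* binary write */
		memcpy(value, buf, sizeof(int32_t));
		return 0;
	}
	if (count > 11)					/* longer than "0x12345678\n" */
		return -EINVAL;

	char ascii[12];
	memcpy(ascii, buf, count);
	if (count > 10) {				/* allow a trailing newline */
		if (ascii[10] != '\n')
			return -EINVAL;
		ascii[10] = '\0';
	} else {
		ascii[count] = '\0';
	}

	errno = 0;
	char *end;
	unsigned long ulval = strtoul(ascii, &end, 16);
	if (errno || end == ascii || *end)
		return -EINVAL;
	*value = (int32_t)ulval;			/* lower 32 bits only */
	return 0;
}

int main(void)
{
	int32_t v;
	if (!parse_qos_request("0x000001f4\n", 11, &v))
		printf("parsed %d\n", v);		/* prints 500 */
	return 0;
}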
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index a1b5edf1bf9..4556182527f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -491,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void)
return tmr;
}
+static void k_itimer_rcu_free(struct rcu_head *head)
+{
+ struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
+
+ kmem_cache_free(posix_timers_cache, tmr);
+}
+
#define IT_ID_SET 1
#define IT_ID_NOT_SET 0
static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
@@ -503,7 +510,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
}
put_pid(tmr->it_pid);
sigqueue_free(tmr->sigq);
- kmem_cache_free(posix_timers_cache, tmr);
+ call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
}
static struct k_clock *clockid_to_kclock(const clockid_t id)
@@ -631,22 +638,18 @@ out:
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
- /*
- * Watch out here. We do a irqsave on the idr_lock and pass the
- * flags part over to the timer lock. Must not let interrupts in
- * while we are moving the lock.
- */
- spin_lock_irqsave(&idr_lock, *flags);
+
+ rcu_read_lock();
timr = idr_find(&posix_timers_id, (int)timer_id);
if (timr) {
- spin_lock(&timr->it_lock);
+ spin_lock_irqsave(&timr->it_lock, *flags);
if (timr->it_signal == current->signal) {
- spin_unlock(&idr_lock);
+ rcu_read_unlock();
return timr;
}
- spin_unlock(&timr->it_lock);
+ spin_unlock_irqrestore(&timr->it_lock, *flags);
}
- spin_unlock_irqrestore(&idr_lock, *flags);
+ rcu_read_unlock();
return NULL;
}
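The lookup now runs under rcu_read_lock() instead of idr_lock, so the irqsave is done by the timer's own lock rather than being handed across from one lock to the other; the timer is only returned if it still belongs to the caller once that lock is held, and freeing is deferred through call_rcu() so a concurrent lookup cannot touch freed memory. A condensed stand-alone sketch of the new ordering, with the RCU and spinlock primitives reduced to no-op stand-ins so the control flow compiles (all names are hypothetical):

#include <stddef.h>
#include <stdio.h>

struct k_itimer { int id; int owner; int locked; };

#define rcu_read_lock()			do { } while (0)
#define rcu_read_unlock()		do { } while (0)
#define spin_lock_irqsave(l, f)		do { (f) = 0; *(l) = 1; } while (0)
#define spin_unlock_irqrestore(l, f)	do { (void)(f); *(l) = 0; } while (0)

static struct k_itimer timers[] = { { .id = 7, .owner = 42 } };

static struct k_itimer *idr_find(int id)
{
	for (size_t i = 0; i < sizeof(timers) / sizeof(timers[0]); i++)
		if (timers[i].id == id)
			return &timers[i];
	return NULL;
}

static struct k_itimer *lock_timer(int timer_id, int me, unsigned long *flags)
{
	struct k_itimer *timr;

	rcu_read_lock();			/* covers the lookup only */
	timr = idr_find(timer_id);
	if (timr) {
		spin_lock_irqsave(&timr->locked, *flags);
		if (timr->owner == me) {	/* still ours? return it locked */
			rcu_read_unlock();
			return timr;
		}
		spin_unlock_irqrestore(&timr->locked, *flags);
	}
	rcu_read_unlock();
	return NULL;
}

int main(void)
{
	unsigned long flags;
	struct k_itimer *t = lock_timer(7, 42, &flags);
	printf("%s\n", t ? "locked timer 7" : "not found");
	return 0;
}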
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index f9bec56d882..8f7b1db1ece 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -25,7 +25,6 @@
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
#include <scsi/scsi_scan.h>
-#include <asm/suspend.h>
#include "power.h"
@@ -55,10 +54,9 @@ static int hibernation_mode = HIBERNATION_SHUTDOWN;
static const struct platform_hibernation_ops *hibernation_ops;
/**
- * hibernation_set_ops - set the global hibernate operations
- * @ops: the hibernation operations to use in subsequent hibernation transitions
+ * hibernation_set_ops - Set the global hibernate operations.
+ * @ops: Hibernation operations to use in subsequent hibernation transitions.
*/
-
void hibernation_set_ops(const struct platform_hibernation_ops *ops)
{
if (ops && !(ops->begin && ops->end && ops->pre_snapshot
@@ -115,10 +113,9 @@ static int hibernation_test(int level) { return 0; }
#endif /* !CONFIG_PM_DEBUG */
/**
- * platform_begin - tell the platform driver that we're starting
- * hibernation
+ * platform_begin - Call platform to start hibernation.
+ * @platform_mode: Whether or not to use the platform driver.
*/
-
static int platform_begin(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
@@ -126,10 +123,9 @@ static int platform_begin(int platform_mode)
}
/**
- * platform_end - tell the platform driver that we've entered the
- * working state
+ * platform_end - Call platform to finish transition to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
*/
-
static void platform_end(int platform_mode)
{
if (platform_mode && hibernation_ops)
@@ -137,8 +133,11 @@ static void platform_end(int platform_mode)
}
/**
- * platform_pre_snapshot - prepare the machine for hibernation using the
- * platform driver if so configured and return an error code if it fails
+ * platform_pre_snapshot - Call platform to prepare the machine for hibernation.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to prepare the system for creating a hibernate image,
+ * if so configured, and return an error code if that fails.
*/
static int platform_pre_snapshot(int platform_mode)
@@ -148,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode)
}
/**
- * platform_leave - prepare the machine for switching to the normal mode
- * of operation using the platform driver (called with interrupts disabled)
+ * platform_leave - Call platform to prepare a transition to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to prepare the machine for switching to the
+ * normal mode of operation.
+ *
+ * This routine is called on one CPU with interrupts disabled.
*/
-
static void platform_leave(int platform_mode)
{
if (platform_mode && hibernation_ops)
@@ -159,10 +162,14 @@ static void platform_leave(int platform_mode)
}
/**
- * platform_finish - switch the machine to the normal mode of operation
- * using the platform driver (must be called after platform_prepare())
+ * platform_finish - Call platform to switch the system to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to switch the machine to the normal mode of
+ * operation.
+ *
+ * This routine must be called after platform_prepare().
*/
-
static void platform_finish(int platform_mode)
{
if (platform_mode && hibernation_ops)
@@ -170,11 +177,15 @@ static void platform_finish(int platform_mode)
}
/**
- * platform_pre_restore - prepare the platform for the restoration from a
- * hibernation image. If the restore fails after this function has been
- * called, platform_restore_cleanup() must be called.
+ * platform_pre_restore - Prepare for hibernate image restoration.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to prepare the system for resume from a hibernation
+ * image.
+ *
+ * If the restore fails after this function has been called,
+ * platform_restore_cleanup() must be called.
*/
-
static int platform_pre_restore(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
@@ -182,12 +193,16 @@ static int platform_pre_restore(int platform_mode)
}
/**
- * platform_restore_cleanup - switch the platform to the normal mode of
- * operation after a failing restore. If platform_pre_restore() has been
- * called before the failing restore, this function must be called too,
- * regardless of the result of platform_pre_restore().
+ * platform_restore_cleanup - Switch to the working state after failing restore.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to switch the system to the normal mode of operation
+ * after a failing restore.
+ *
+ * If platform_pre_restore() has been called before the failing restore, this
+ * function must be called too, regardless of the result of
+ * platform_pre_restore().
*/
-
static void platform_restore_cleanup(int platform_mode)
{
if (platform_mode && hibernation_ops)
@@ -195,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode)
}
/**
- * platform_recover - recover the platform from a failure to suspend
- * devices.
+ * platform_recover - Recover from a failure to suspend devices.
+ * @platform_mode: Whether or not to use the platform driver.
*/
-
static void platform_recover(int platform_mode)
{
if (platform_mode && hibernation_ops && hibernation_ops->recover)
@@ -206,13 +220,12 @@ static void platform_recover(int platform_mode)
}
/**
- * swsusp_show_speed - print the time elapsed between two events.
- * @start: Starting event.
- * @stop: Final event.
- * @nr_pages - number of pages processed between @start and @stop
- * @msg - introductory message to print
+ * swsusp_show_speed - Print time elapsed between two events during hibernation.
+ * @start: Starting event.
+ * @stop: Final event.
+ * @nr_pages: Number of memory pages processed between @start and @stop.
+ * @msg: Additional diagnostic message to print.
*/
-
void swsusp_show_speed(struct timeval *start, struct timeval *stop,
unsigned nr_pages, char *msg)
{
@@ -235,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
}
/**
- * create_image - freeze devices that need to be frozen with interrupts
- * off, create the hibernation image and thaw those devices. Control
- * reappears in this routine after a restore.
+ * create_image - Create a hibernation image.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
+ * and execute the drivers' .thaw_noirq() callbacks.
+ *
+ * Control reappears in this routine after the subsequent restore.
*/
-
static int create_image(int platform_mode)
{
int error;
- error = arch_prepare_suspend();
- if (error)
- return error;
-
- /* At this point, dpm_suspend_start() has been called, but *not*
- * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
- * Otherwise, drivers for some devices (e.g. interrupt controllers)
- * become desynchronized with the actual state of the hardware
- * at resume time, and evil weirdness ensues.
- */
error = dpm_suspend_noirq(PMSG_FREEZE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
@@ -297,9 +303,6 @@ static int create_image(int platform_mode)
Power_up:
syscore_resume();
- /* NOTE: dpm_resume_noirq() is just a resume() for devices
- * that suspended with irqs off ... no overall powerup.
- */
Enable_irqs:
local_irq_enable();
@@ -317,14 +320,11 @@ static int create_image(int platform_mode)
}
/**
- * hibernation_snapshot - quiesce devices and create the hibernation
- * snapshot image.
- * @platform_mode - if set, use the platform driver, if available, to
- * prepare the platform firmware for the power transition.
+ * hibernation_snapshot - Quiesce devices and create a hibernation image.
+ * @platform_mode: If set, use platform driver to prepare for the transition.
*
- * Must be called with pm_mutex held
+ * This routine must be called with pm_mutex held.
*/
-
int hibernation_snapshot(int platform_mode)
{
pm_message_t msg = PMSG_RECOVER;
@@ -384,13 +384,14 @@ int hibernation_snapshot(int platform_mode)
}
/**
- * resume_target_kernel - prepare devices that need to be suspended with
- * interrupts off, restore the contents of highmem that have not been
- * restored yet from the image and run the low level code that will restore
- * the remaining contents of memory and switch to the just restored target
- * kernel.
+ * resume_target_kernel - Restore system state from a hibernation image.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
+ * highmem that have not been restored yet from the image and run the low-level
+ * code that will restore the remaining contents of memory and switch to the
+ * just restored target kernel.
*/
-
static int resume_target_kernel(bool platform_mode)
{
int error;
@@ -416,24 +417,26 @@ static int resume_target_kernel(bool platform_mode)
if (error)
goto Enable_irqs;
- /* We'll ignore saved state, but this gets preempt count (etc) right */
save_processor_state();
error = restore_highmem();
if (!error) {
error = swsusp_arch_resume();
/*
* The code below is only ever reached in case of a failure.
- * Otherwise execution continues at place where
- * swsusp_arch_suspend() was called
+ * Otherwise, execution continues at the place where
+ * swsusp_arch_suspend() was called.
*/
BUG_ON(!error);
- /* This call to restore_highmem() undos the previous one */
+ /*
+ * This call to restore_highmem() reverts the changes made by
+ * the previous one.
+ */
restore_highmem();
}
/*
* The only reason why swsusp_arch_resume() can fail is memory being
* very tight, so we have to free it as soon as we can to avoid
- * subsequent failures
+ * subsequent failures.
*/
swsusp_free();
restore_processor_state();
@@ -456,14 +459,12 @@ static int resume_target_kernel(bool platform_mode)
}
/**
- * hibernation_restore - quiesce devices and restore the hibernation
- * snapshot image. If successful, control returns in hibernation_snaphot()
- * @platform_mode - if set, use the platform driver, if available, to
- * prepare the platform firmware for the transition.
+ * hibernation_restore - Quiesce devices and restore from a hibernation image.
+ * @platform_mode: If set, use platform driver to prepare for the transition.
*
- * Must be called with pm_mutex held
+ * This routine must be called with pm_mutex held. If it is successful, control
+ * reappears in the restored target kernel in hibernation_snapshot().
*/
-
int hibernation_restore(int platform_mode)
{
int error;
@@ -483,10 +484,8 @@ int hibernation_restore(int platform_mode)
}
/**
- * hibernation_platform_enter - enter the hibernation state using the
- * platform driver (if available)
+ * hibernation_platform_enter - Power off the system using the platform driver.
*/
-
int hibernation_platform_enter(void)
{
int error;
@@ -557,12 +556,12 @@ int hibernation_platform_enter(void)
}
/**
- * power_down - Shut the machine down for hibernation.
+ * power_down - Shut the machine down for hibernation.
*
- * Use the platform driver, if configured so; otherwise try
- * to power off or reboot.
+ * Use the platform driver, if configured, to put the system into the sleep
+ * state corresponding to hibernation, or try to power it off or reboot,
+ * depending on the value of hibernation_mode.
*/
-
static void power_down(void)
{
switch (hibernation_mode) {
@@ -599,9 +598,8 @@ static int prepare_processes(void)
}
/**
- * hibernate - The granpappy of the built-in hibernation management
+ * hibernate - Carry out system hibernation, including saving the image.
*/
-
int hibernate(void)
{
int error;
@@ -679,17 +677,20 @@ int hibernate(void)
/**
- * software_resume - Resume from a saved image.
+ * software_resume - Resume from a saved hibernation image.
*
- * Called as a late_initcall (so all devices are discovered and
- * initialized), we call swsusp to see if we have a saved image or not.
- * If so, we quiesce devices, the restore the saved image. We will
- * return above (in hibernate() ) if everything goes well.
- * Otherwise, we fail gracefully and return to the normally
- * scheduled program.
+ * This routine is called as a late initcall, when all devices have been
+ * discovered and initialized already.
*
+ * The image reading code is called to see if there is a hibernation image
+ * available for reading. If that is the case, devices are quiesced and the
+ * contents of memory are restored from the saved image.
+ *
+ * If this is successful, control reappears in the restored target kernel in
+ * hibernation_snapshot(), which returns to hibernate(). Otherwise, the routine
+ * attempts to recover gracefully and make the kernel return to the normal mode
+ * of operation.
*/
-
static int software_resume(void)
{
int error;
@@ -819,21 +820,17 @@ static const char * const hibernation_modes[] = {
[HIBERNATION_TESTPROC] = "testproc",
};
-/**
- * disk - Control hibernation mode
- *
- * Suspend-to-disk can be handled in several ways. We have a few options
- * for putting the system to sleep - using the platform driver (e.g. ACPI
- * or other hibernation_ops), powering off the system or rebooting the
- * system (for testing) as well as the two test modes.
+/*
+ * /sys/power/disk - Control hibernation mode.
*
- * The system can support 'platform', and that is known a priori (and
- * encoded by the presence of hibernation_ops). However, the user may
- * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the
- * test modes, 'test' or 'testproc'.
+ * Hibernation can be handled in several ways: using the platform driver
+ * (e.g. ACPI or other hibernation_ops), powering the system off or
+ * rebooting it (mostly for testing purposes), or using one of the two
+ * available test modes.
*
- * show() will display what the mode is currently set to.
- * store() will accept one of
+ * The sysfs file /sys/power/disk provides an interface for selecting the
+ * hibernation mode to use. Reading from this file causes the available modes
+ * to be printed. There are 5 modes that can be supported:
*
* 'platform'
* 'shutdown'
@@ -841,8 +838,14 @@ static const char * const hibernation_modes[] = {
* 'test'
* 'testproc'
*
- * It will only change to 'platform' if the system
- * supports it (as determined by having hibernation_ops).
+ * If a platform hibernation driver is in use, 'platform' will be supported
+ * and will be used by default. Otherwise, 'shutdown' will be used by default.
+ * The selected option (i.e. the one corresponding to the current value of
+ * hibernation_mode) is enclosed in square brackets.
+ *
+ * To select a given hibernation mode it is necessary to write the mode's
+ * string representation (as returned by reading from /sys/power/disk) back
+ * into /sys/power/disk.
*/
static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -875,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
return buf-start;
}
-
static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
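The comment block above documents /sys/power/disk as a plain-text interface: reading it lists the supported modes with the active one bracketed, and writing one of the mode strings back selects it. A small illustrative user-space helper (requires root; error handling kept minimal):

#include <stdio.h>

int main(int argc, char **argv)
{
	char modes[128];
	FILE *f = fopen("/sys/power/disk", "r");

	if (!f)
		return 1;
	if (fgets(modes, sizeof(modes), f))
		printf("available: %s", modes);	/* e.g. "[platform] shutdown reboot ..." */
	fclose(f);

	if (argc > 1) {				/* e.g. "platform" or "shutdown" */
		f = fopen("/sys/power/disk", "w");
		if (!f)
			return 1;
		fprintf(f, "%s\n", argv[1]);
		fclose(f);
	}
	return 0;
}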
diff --git a/kernel/printk.c b/kernel/printk.c
index da8ca817eae..35185392173 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
@@ -167,46 +168,74 @@ void log_buf_kexec_setup(void)
}
#endif
+/* requested log_buf_len from kernel cmdline */
+static unsigned long __initdata new_log_buf_len;
+
+/* save requested log_buf_len since it's too early to process it */
static int __init log_buf_len_setup(char *str)
{
unsigned size = memparse(str, &str);
- unsigned long flags;
if (size)
size = roundup_pow_of_two(size);
- if (size > log_buf_len) {
- unsigned start, dest_idx, offset;
- char *new_log_buf;
+ if (size > log_buf_len)
+ new_log_buf_len = size;
- new_log_buf = alloc_bootmem(size);
- if (!new_log_buf) {
- printk(KERN_WARNING "log_buf_len: allocation failed\n");
- goto out;
- }
+ return 0;
+}
+early_param("log_buf_len", log_buf_len_setup);
- spin_lock_irqsave(&logbuf_lock, flags);
- log_buf_len = size;
- log_buf = new_log_buf;
-
- offset = start = min(con_start, log_start);
- dest_idx = 0;
- while (start != log_end) {
- log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)];
- start++;
- dest_idx++;
- }
- log_start -= offset;
- con_start -= offset;
- log_end -= offset;
- spin_unlock_irqrestore(&logbuf_lock, flags);
+void __init setup_log_buf(int early)
+{
+ unsigned long flags;
+ unsigned start, dest_idx, offset;
+ char *new_log_buf;
+ int free;
+
+ if (!new_log_buf_len)
+ return;
+
+ if (early) {
+ unsigned long mem;
- printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len);
+ mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
+ if (mem == MEMBLOCK_ERROR)
+ return;
+ new_log_buf = __va(mem);
+ } else {
+ new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
}
-out:
- return 1;
-}
-__setup("log_buf_len=", log_buf_len_setup);
+ if (unlikely(!new_log_buf)) {
+ pr_err("log_buf_len: %lu bytes not available\n",
+ new_log_buf_len);
+ return;
+ }
+
+ spin_lock_irqsave(&logbuf_lock, flags);
+ log_buf_len = new_log_buf_len;
+ log_buf = new_log_buf;
+ new_log_buf_len = 0;
+ free = __LOG_BUF_LEN - log_end;
+
+ offset = start = min(con_start, log_start);
+ dest_idx = 0;
+ while (start != log_end) {
+ unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
+
+ log_buf[dest_idx] = __log_buf[log_idx_mask];
+ start++;
+ dest_idx++;
+ }
+ log_start -= offset;
+ con_start -= offset;
+ log_end -= offset;
+ spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ pr_info("log_buf_len: %d\n", log_buf_len);
+ pr_info("early log buf free: %d(%d%%)\n",
+ free, (free * 100) / __LOG_BUF_LEN);
+}
#ifdef CONFIG_BOOT_PRINTK_DELAY
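setup_log_buf() now does the relocation in one place: allocate the larger buffer (memblock early, bootmem later), copy the live region of the old ring under logbuf_lock, and rebase the indices by their common offset. A stand-alone sketch of just the copy-and-rebase step, with hypothetical small sizes:

#include <stdio.h>
#include <stdlib.h>

#define OLD_LEN 8	/* power of two, like __LOG_BUF_LEN */

static char old_buf[OLD_LEN];
static unsigned log_start, con_start, log_end;	/* monotonically increasing */

int main(void)
{
	const char *msg = "abcdefghij";		/* 10 bytes, wraps the ring once */
	for (unsigned i = 0; msg[i]; i++)
		old_buf[log_end++ & (OLD_LEN - 1)] = msg[i];
	log_start = con_start = log_end - OLD_LEN;	/* oldest byte still present */

	unsigned new_len = 32;
	char *new_buf = calloc(1, new_len);
	if (!new_buf)
		return 1;

	/* copy the live region into the new buffer, then rebase the indices */
	unsigned offset = log_start < con_start ? log_start : con_start;
	unsigned dest = 0;
	for (unsigned start = offset; start != log_end; start++)
		new_buf[dest++] = old_buf[start & (OLD_LEN - 1)];
	log_start -= offset;
	con_start -= offset;
	log_end -= offset;

	printf("copied %u bytes: %.*s\n", log_end, (int)log_end, new_buf);
	free(new_buf);
	return 0;
}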
diff --git a/kernel/profile.c b/kernel/profile.c
index 14c9f87b9fc..961b389fe52 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -303,14 +303,12 @@ static void profile_discard_flip_buffers(void)
mutex_unlock(&profile_flip_mutex);
}
-void profile_hits(int type, void *__pc, unsigned int nr_hits)
+static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
int i, j, cpu;
struct profile_hit *hits;
- if (prof_on != type || !prof_buffer)
- return;
pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
@@ -417,16 +415,20 @@ out_free:
#define profile_discard_flip_buffers() do { } while (0)
#define profile_cpu_callback NULL
-void profile_hits(int type, void *__pc, unsigned int nr_hits)
+static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
unsigned long pc;
-
- if (prof_on != type || !prof_buffer)
- return;
pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !CONFIG_SMP */
+
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
+{
+ if (prof_on != type || !prof_buffer)
+ return;
+ do_profile_hits(type, __pc, nr_hits);
+}
EXPORT_SYMBOL_GPL(profile_hits);
void profile_tick(int type)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 7a81fc07134..2df115790cd 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -562,7 +562,7 @@ static int ptrace_resume(struct task_struct *child, long request,
}
child->exit_code = data;
- wake_up_process(child);
+ wake_up_state(child, __TASK_TRACED);
return 0;
}
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f07d2f03181..77a7671dd14 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -36,7 +36,7 @@
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/nmi.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
@@ -95,7 +95,6 @@ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq);
DEFINE_PER_CPU(char, rcu_cpu_has_work);
static char rcu_kthreads_spawnable;
@@ -163,7 +162,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
#ifdef CONFIG_NO_HZ
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = 1,
- .dynticks = 1,
+ .dynticks = ATOMIC_INIT(1),
};
#endif /* #ifdef CONFIG_NO_HZ */
@@ -322,13 +321,25 @@ void rcu_enter_nohz(void)
unsigned long flags;
struct rcu_dynticks *rdtp;
- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
local_irq_save(flags);
rdtp = &__get_cpu_var(rcu_dynticks);
- rdtp->dynticks++;
- rdtp->dynticks_nesting--;
- WARN_ON_ONCE(rdtp->dynticks & 0x1);
+ if (--rdtp->dynticks_nesting) {
+ local_irq_restore(flags);
+ return;
+ }
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic_inc(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
local_irq_restore(flags);
+
+ /* If the interrupt queued a callback, get out of dyntick mode. */
+ if (in_irq() &&
+ (__get_cpu_var(rcu_sched_data).nxtlist ||
+ __get_cpu_var(rcu_bh_data).nxtlist ||
+ rcu_preempt_needs_cpu(smp_processor_id())))
+ set_need_resched();
}
/*
@@ -344,11 +355,16 @@ void rcu_exit_nohz(void)
local_irq_save(flags);
rdtp = &__get_cpu_var(rcu_dynticks);
- rdtp->dynticks++;
- rdtp->dynticks_nesting++;
- WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
+ if (rdtp->dynticks_nesting++) {
+ local_irq_restore(flags);
+ return;
+ }
+ smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
+ atomic_inc(&rdtp->dynticks);
+ /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+ smp_mb__after_atomic_inc(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
local_irq_restore(flags);
- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
}
/**
@@ -362,11 +378,15 @@ void rcu_nmi_enter(void)
{
struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
- if (rdtp->dynticks & 0x1)
+ if (rdtp->dynticks_nmi_nesting == 0 &&
+ (atomic_read(&rdtp->dynticks) & 0x1))
return;
- rdtp->dynticks_nmi++;
- WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+ rdtp->dynticks_nmi_nesting++;
+ smp_mb__before_atomic_inc(); /* Force delay from prior write. */
+ atomic_inc(&rdtp->dynticks);
+ /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+ smp_mb__after_atomic_inc(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
}
/**
@@ -380,11 +400,14 @@ void rcu_nmi_exit(void)
{
struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
- if (rdtp->dynticks & 0x1)
+ if (rdtp->dynticks_nmi_nesting == 0 ||
+ --rdtp->dynticks_nmi_nesting != 0)
return;
- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
- rdtp->dynticks_nmi++;
- WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic_inc(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic_inc(); /* Force delay to next write. */
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
}
/**
@@ -395,13 +418,7 @@ void rcu_nmi_exit(void)
*/
void rcu_irq_enter(void)
{
- struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
-
- if (rdtp->dynticks_nesting++)
- return;
- rdtp->dynticks++;
- WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+ rcu_exit_nohz();
}
/**
@@ -413,18 +430,7 @@ void rcu_irq_enter(void)
*/
void rcu_irq_exit(void)
{
- struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
-
- if (--rdtp->dynticks_nesting)
- return;
- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
- rdtp->dynticks++;
- WARN_ON_ONCE(rdtp->dynticks & 0x1);
-
- /* If the interrupt queued a callback, get out of dyntick mode. */
- if (__this_cpu_read(rcu_sched_data.nxtlist) ||
- __this_cpu_read(rcu_bh_data.nxtlist))
- set_need_resched();
+ rcu_enter_nohz();
}
#ifdef CONFIG_SMP
@@ -436,19 +442,8 @@ void rcu_irq_exit(void)
*/
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
- int ret;
- int snap;
- int snap_nmi;
-
- snap = rdp->dynticks->dynticks;
- snap_nmi = rdp->dynticks->dynticks_nmi;
- smp_mb(); /* Order sampling of snap with end of grace period. */
- rdp->dynticks_snap = snap;
- rdp->dynticks_nmi_snap = snap_nmi;
- ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
- if (ret)
- rdp->dynticks_fqs++;
- return ret;
+ rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+ return 0;
}
/*
@@ -459,16 +454,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
*/
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
- long curr;
- long curr_nmi;
- long snap;
- long snap_nmi;
+ unsigned long curr;
+ unsigned long snap;
- curr = rdp->dynticks->dynticks;
- snap = rdp->dynticks_snap;
- curr_nmi = rdp->dynticks->dynticks_nmi;
- snap_nmi = rdp->dynticks_nmi_snap;
- smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+ curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
+ snap = (unsigned long)rdp->dynticks_snap;
/*
* If the CPU passed through or entered a dynticks idle phase with
@@ -478,8 +468,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* read-side critical section that started before the beginning
* of the current RCU grace period.
*/
- if ((curr != snap || (curr & 0x1) == 0) &&
- (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
+ if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
rdp->dynticks_fqs++;
return 1;
}
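The reworked dynticks scheme keeps one atomic counter per CPU that is even while the CPU is in dynticks-idle and odd otherwise; full barriers around each increment let a remote CPU snapshot it and later conclude that a quiescent state was passed if the counter is even or has advanced by at least two. A stand-alone C11 sketch of that even/odd protocol (the kernel reads via atomic_add_return(0, ...) for a full barrier and uses an unsigned comparison to cope with wraparound):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int dynticks = 1;		/* odd: CPU starts out non-idle */

static void enter_idle(void)
{
	atomic_fetch_add(&dynticks, 1);	/* now even; seq_cst orders prior accesses */
}

static void exit_idle(void)
{
	atomic_fetch_add(&dynticks, 1);	/* now odd; seq_cst orders later accesses */
}

static int snapshot(void)
{
	return atomic_load(&dynticks);
}

static int passed_quiescent_state(int snap)
{
	int curr = atomic_load(&dynticks);

	/* idle right now, or went through idle since the snapshot */
	return (curr & 0x1) == 0 || curr - snap >= 2;
}

int main(void)
{
	int snap = snapshot();		/* grace period begins */
	enter_idle();			/* CPU goes idle... */
	exit_idle();			/* ...and comes back */
	printf("quiescent: %d\n", passed_quiescent_state(snap));
	return 0;
}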
@@ -908,6 +897,12 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
unsigned long gp_duration;
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+
+ /*
+ * Ensure that all grace-period and pre-grace-period activity
+ * is seen before the assignment to rsp->completed.
+ */
+ smp_mb(); /* See above block comment. */
gp_duration = jiffies - rsp->gp_start;
if (gp_duration > rsp->gp_max)
rsp->gp_max = gp_duration;
@@ -1455,25 +1450,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
*/
static void rcu_process_callbacks(void)
{
- /*
- * Memory references from any prior RCU read-side critical sections
- * executed by the interrupted code must be seen before any RCU
- * grace-period manipulations below.
- */
- smp_mb(); /* See above block comment. */
-
__rcu_process_callbacks(&rcu_sched_state,
&__get_cpu_var(rcu_sched_data));
__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
rcu_preempt_process_callbacks();
- /*
- * Memory references from any later RCU read-side critical sections
- * executed by the interrupted code must be seen after any RCU
- * grace-period manipulations above.
- */
- smp_mb(); /* See above block comment. */
-
/* If we are last CPU on way to dyntick-idle mode, accelerate it. */
rcu_needs_cpu_flush();
}
@@ -1494,7 +1475,7 @@ static void invoke_rcu_cpu_kthread(void)
local_irq_restore(flags);
return;
}
- wake_up(&__get_cpu_var(rcu_cpu_wq));
+ wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
local_irq_restore(flags);
}
@@ -1544,13 +1525,10 @@ static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
*/
static void rcu_cpu_kthread_timer(unsigned long arg)
{
- unsigned long flags;
struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
struct rcu_node *rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rnp->wakemask |= rdp->grpmask;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ atomic_or(rdp->grpmask, &rnp->wakemask);
invoke_rcu_node_kthread(rnp);
}
@@ -1617,14 +1595,12 @@ static int rcu_cpu_kthread(void *arg)
unsigned long flags;
int spincnt = 0;
unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
- wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu);
char work;
char *workp = &per_cpu(rcu_cpu_has_work, cpu);
for (;;) {
*statusp = RCU_KTHREAD_WAITING;
- wait_event_interruptible(*wqp,
- *workp != 0 || kthread_should_stop());
+ rcu_wait(*workp != 0 || kthread_should_stop());
local_bh_disable();
if (rcu_cpu_kthread_should_stop(cpu)) {
local_bh_enable();
@@ -1672,10 +1648,10 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
if (IS_ERR(t))
return PTR_ERR(t);
kthread_bind(t, cpu);
+ set_task_state(t, TASK_INTERRUPTIBLE);
per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
per_cpu(rcu_cpu_kthread_task, cpu) = t;
- wake_up_process(t);
sp.sched_priority = RCU_KTHREAD_PRIO;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
return 0;
@@ -1698,11 +1674,10 @@ static int rcu_node_kthread(void *arg)
for (;;) {
rnp->node_kthread_status = RCU_KTHREAD_WAITING;
- wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0);
+ rcu_wait(atomic_read(&rnp->wakemask) != 0);
rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
raw_spin_lock_irqsave(&rnp->lock, flags);
- mask = rnp->wakemask;
- rnp->wakemask = 0;
+ mask = atomic_xchg(&rnp->wakemask, 0);
rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
if ((mask & 0x1) == 0)
@@ -1781,9 +1756,9 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
if (IS_ERR(t))
return PTR_ERR(t);
raw_spin_lock_irqsave(&rnp->lock, flags);
+ set_task_state(t, TASK_INTERRUPTIBLE);
rnp->node_kthread_task = t;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- wake_up_process(t);
sp.sched_priority = 99;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
@@ -1800,21 +1775,16 @@ static int __init rcu_spawn_kthreads(void)
rcu_kthreads_spawnable = 1;
for_each_possible_cpu(cpu) {
- init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu));
per_cpu(rcu_cpu_has_work, cpu) = 0;
if (cpu_online(cpu))
(void)rcu_spawn_one_cpu_kthread(cpu);
}
rnp = rcu_get_root(rcu_state);
- init_waitqueue_head(&rnp->node_wq);
- rcu_init_boost_waitqueue(rnp);
(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
- if (NUM_RCU_NODES > 1)
- rcu_for_each_leaf_node(rcu_state, rnp) {
- init_waitqueue_head(&rnp->node_wq);
- rcu_init_boost_waitqueue(rnp);
+ if (NUM_RCU_NODES > 1) {
+ rcu_for_each_leaf_node(rcu_state, rnp)
(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
- }
+ }
return 0;
}
early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 257664815d5..7b9a08b4aae 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,11 +84,9 @@
* Dynticks per-CPU state.
*/
struct rcu_dynticks {
- int dynticks_nesting; /* Track nesting level, sort of. */
- int dynticks; /* Even value for dynticks-idle, else odd. */
- int dynticks_nmi; /* Even value for either dynticks-idle or */
- /* not in nmi handler, else odd. So this */
- /* remains even for nmi from irq handler. */
+ int dynticks_nesting; /* Track irq/process nesting level. */
+ int dynticks_nmi_nesting; /* Track NMI nesting level. */
+ atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
};
/* RCU's kthread states for tracing. */
@@ -121,7 +119,9 @@ struct rcu_node {
/* elements that need to drain to allow the */
/* current expedited grace period to */
/* complete (only for TREE_PREEMPT_RCU). */
- unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */
+ atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
+ /* Since this has meaning only for leaf */
+ /* rcu_node structures, 32 bits suffices. */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -159,9 +159,6 @@ struct rcu_node {
struct task_struct *boost_kthread_task;
/* kthread that takes care of priority */
/* boosting for this rcu_node structure. */
- wait_queue_head_t boost_wq;
- /* Wait queue on which to park the boost */
- /* kthread. */
unsigned int boost_kthread_status;
/* State of boost_kthread_task for tracing. */
unsigned long n_tasks_boosted;
@@ -188,9 +185,6 @@ struct rcu_node {
/* kthread that takes care of this rcu_node */
/* structure, for example, awakening the */
/* per-CPU kthreads as needed. */
- wait_queue_head_t node_wq;
- /* Wait queue on which to park the per-node */
- /* kthread. */
unsigned int node_kthread_status;
/* State of node_kthread_task for tracing. */
} ____cacheline_internodealigned_in_smp;
@@ -284,7 +278,6 @@ struct rcu_data {
/* 3) dynticks interface. */
struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
int dynticks_snap; /* Per-GP tracking for dynticks. */
- int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
#endif /* #ifdef CONFIG_NO_HZ */
/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
@@ -337,6 +330,16 @@ struct rcu_data {
/* scheduling clock irq */
/* before ratting on them. */
+#define rcu_wait(cond) \
+do { \
+ for (;;) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (cond) \
+ break; \
+ schedule(); \
+ } \
+ __set_current_state(TASK_RUNNING); \
+} while (0)
/*
* RCU global state, including node hierarchy. This hierarchy is
@@ -446,7 +449,6 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
static void rcu_needs_cpu_flush(void);
-static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
cpumask_var_t cm);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3f6559a5f5c..a767b7dac36 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1196,8 +1196,7 @@ static int rcu_boost_kthread(void *arg)
for (;;) {
rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
- wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks ||
- rnp->exp_tasks);
+ rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
more2boost = rcu_boost(rnp);
if (more2boost)
@@ -1275,14 +1274,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
}
/*
- * Initialize the RCU-boost waitqueue.
- */
-static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
-{
- init_waitqueue_head(&rnp->boost_wq);
-}
-
-/*
* Create an RCU-boost kthread for the specified node if one does not
* already exist. We only create this kthread for preemptible RCU.
* Returns zero if all is well, a negated errno otherwise.
@@ -1304,9 +1295,9 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
if (IS_ERR(t))
return PTR_ERR(t);
raw_spin_lock_irqsave(&rnp->lock, flags);
+ set_task_state(t, TASK_INTERRUPTIBLE);
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- wake_up_process(t);
sp.sched_priority = RCU_KTHREAD_PRIO;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
return 0;
@@ -1328,10 +1319,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
-static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
-{
-}
-
static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct rcu_node *rnp,
int rnp_index)
@@ -1520,7 +1507,6 @@ int rcu_needs_cpu(int cpu)
{
int c = 0;
int snap;
- int snap_nmi;
int thatcpu;
/* Check for being in the holdoff period. */
@@ -1531,10 +1517,10 @@ int rcu_needs_cpu(int cpu)
for_each_online_cpu(thatcpu) {
if (thatcpu == cpu)
continue;
- snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
- snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
+ snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
+ thatcpu).dynticks);
smp_mb(); /* Order sampling of snap with end of grace period. */
- if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
+ if ((snap & 0x1) != 0) {
per_cpu(rcu_dyntick_drain, cpu) = 0;
per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
return rcu_needs_cpu_quick_check(cpu);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index aa0fd72b4bc..9678cc3650f 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -69,10 +69,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->passed_quiesc, rdp->passed_quiesc_completed,
rdp->qs_pending);
#ifdef CONFIG_NO_HZ
- seq_printf(m, " dt=%d/%d dn=%d df=%lu",
- rdp->dynticks->dynticks,
+ seq_printf(m, " dt=%d/%d/%d df=%lu",
+ atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
- rdp->dynticks->dynticks_nmi,
+ rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
@@ -141,9 +141,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
rdp->qs_pending);
#ifdef CONFIG_NO_HZ
seq_printf(m, ",%d,%d,%d,%lu",
- rdp->dynticks->dynticks,
+ atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
- rdp->dynticks->dynticks_nmi,
+ rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
@@ -167,7 +167,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
{
seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
#ifdef CONFIG_NO_HZ
- seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
+ seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
#endif /* #ifdef CONFIG_NO_HZ */
seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
#ifdef CONFIG_TREE_PREEMPT_RCU
diff --git a/kernel/sched.c b/kernel/sched.c
index 2d12893b8b0..cbb3a0eee58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2573,7 +2573,26 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
if (!next)
smp_send_reschedule(cpu);
}
-#endif
+
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (p->on_cpu) {
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
+
+}
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu)
{
@@ -2631,17 +2650,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
while (p->on_cpu) {
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
/*
- * If called from interrupt context we could have landed in the
- * middle of schedule(), in this case we should take care not
- * to spin on ->on_cpu if p is current, since that would
- * deadlock.
+ * In case the architecture enables interrupts in
+ * context_switch(), we cannot busy wait, since that
+ * would lead to deadlocks when an interrupt hits and
+ * tries to wake up @prev. So bail and do a complete
+ * remote wakeup.
*/
- if (p == current) {
- ttwu_queue(p, cpu);
+ if (ttwu_activate_remote(p, wake_flags))
goto stat;
- }
-#endif
+#else
cpu_relax();
+#endif
}
/*
* Pairs with the smp_wmb() in finish_lock_switch().
@@ -5841,7 +5860,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
- cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+ do_set_cpus_allowed(idle, cpumask_of(cpu));
/*
* We're having a chicken and egg problem, even though we are
* holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5929,6 +5948,16 @@ static inline void sched_init_granularity(void)
}
#ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ if (p->sched_class && p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+ else {
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+ }
+}
+
/*
* This is how migration works:
*
@@ -5974,12 +6003,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
goto out;
}
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
- else {
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
- }
+ do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), new_mask))
@@ -8764,42 +8788,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
return 0;
}
-static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup)
-{
- int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
- if (retval)
- return retval;
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- retval = cpu_cgroup_can_attach_task(cgrp, c);
- if (retval) {
- rcu_read_unlock();
- return retval;
- }
- }
- rcu_read_unlock();
- }
- return 0;
-}
-
static void
-cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cont, struct task_struct *tsk,
- bool threadgroup)
+cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
sched_move_task(tsk);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- sched_move_task(c);
- }
- rcu_read_unlock();
- }
}
static void
@@ -8887,8 +8879,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.create = cpu_cgroup_create,
.destroy = cpu_cgroup_destroy,
- .can_attach = cpu_cgroup_can_attach,
- .attach = cpu_cgroup_attach,
+ .can_attach_task = cpu_cgroup_can_attach_task,
+ .attach_task = cpu_cgroup_attach_task,
.exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e32a9b70ee9..433491c2dc8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1076,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->on_rq = 0;
update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
- update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq);
/*
* Normalize the entity after updating the min_vruntime because the
@@ -1086,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
+
+ update_min_vruntime(cfs_rq);
+ update_cfs_shares(cfs_rq);
}
/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 64b2a37c07d..88725c939e0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1263,6 +1263,7 @@ static int find_lowest_rq(struct task_struct *task)
if (!cpumask_test_cpu(this_cpu, lowest_mask))
this_cpu = -1; /* Skip this_cpu opt if not among lowest */
+ rcu_read_lock();
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_AFFINE) {
int best_cpu;
@@ -1272,15 +1273,20 @@ static int find_lowest_rq(struct task_struct *task)
* remote processor.
*/
if (this_cpu != -1 &&
- cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
+ cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
return this_cpu;
+ }
best_cpu = cpumask_first_and(lowest_mask,
sched_domain_span(sd));
- if (best_cpu < nr_cpu_ids)
+ if (best_cpu < nr_cpu_ids) {
+ rcu_read_unlock();
return best_cpu;
+ }
}
}
+ rcu_read_unlock();
/*
* And finally, if there were no matches within the domains
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 48ddf431db0..331e01bcd02 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
#ifdef CONFIG_SMP
/* domain-specific stats */
- preempt_disable();
+ rcu_read_lock();
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
}
- preempt_enable();
+ rcu_read_unlock();
#endif
}
kfree(mask_str);
diff --git a/kernel/signal.c b/kernel/signal.c
index ad5e818baac..86c32b884f8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3023,8 +3023,10 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
SYSCALL_DEFINE0(pause)
{
- current->state = TASK_INTERRUPTIBLE;
- schedule();
+ while (!signal_pending(current)) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ }
return -ERESTARTNOHAND;
}
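The fix makes pause() re-check signal_pending() in a loop around schedule(), so a wakeup that arrives before the task state is set cannot be lost. This is the same discipline as the classic condition-variable wait loop; a user-space analogue with hypothetical names (compile with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int signal_pending;

static void *waiter(void *arg)
{
	pthread_mutex_lock(&lock);
	while (!signal_pending)			/* always a loop, never a bare wait */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	puts("woken with signal_pending set");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	pthread_mutex_lock(&lock);
	signal_pending = 1;			/* set the condition... */
	pthread_cond_signal(&cond);		/* ...then wake the sleeper */
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}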
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3dd0c46fa3b..4fc92445a29 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -56,6 +56,7 @@
#include <linux/kprobes.h>
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
+#include <linux/kmod.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -616,6 +617,11 @@ static struct ctl_table kern_table[] = {
.child = random_table,
},
{
+ .procname = "usermodehelper",
+ .mode = 0555,
+ .child = usermodehelper_table,
+ },
+ {
.procname = "overflowuid",
.data = &overflowuid,
.maxlen = sizeof(int),
@@ -1500,7 +1506,7 @@ static struct ctl_table fs_table[] = {
static struct ctl_table debug_table[] = {
#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
- defined(CONFIG_S390)
+ defined(CONFIG_S390) || defined(CONFIG_TILE)
{
.procname = "exception-trace",
.data = &show_unhandled_signals,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d017c2c82c4..1ee417fcbfa 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -109,12 +109,18 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
static void ftrace_global_list_func(unsigned long ip,
unsigned long parent_ip)
{
- struct ftrace_ops *op = rcu_dereference_raw(ftrace_global_list); /*see above*/
+ struct ftrace_ops *op;
+
+ if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
+ return;
+ trace_recursion_set(TRACE_GLOBAL_BIT);
+ op = rcu_dereference_raw(ftrace_global_list); /*see above*/
while (op != &ftrace_list_end) {
op->func(ip, parent_ip);
op = rcu_dereference_raw(op->next); /*see above*/
};
+ trace_recursion_clear(TRACE_GLOBAL_BIT);
}
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
@@ -1638,12 +1644,12 @@ static void ftrace_startup_enable(int command)
ftrace_run_update_code(command);
}
-static void ftrace_startup(struct ftrace_ops *ops, int command)
+static int ftrace_startup(struct ftrace_ops *ops, int command)
{
bool hash_enable = true;
if (unlikely(ftrace_disabled))
- return;
+ return -ENODEV;
ftrace_start_up++;
command |= FTRACE_ENABLE_CALLS;
@@ -1662,6 +1668,8 @@ static void ftrace_startup(struct ftrace_ops *ops, int command)
ftrace_hash_rec_enable(ops, 1);
ftrace_startup_enable(command);
+
+ return 0;
}
static void ftrace_shutdown(struct ftrace_ops *ops, int command)
@@ -2501,7 +2509,7 @@ static void __enable_ftrace_function_probe(void)
ret = __register_ftrace_function(&trace_probe_ops);
if (!ret)
- ftrace_startup(&trace_probe_ops, 0);
+ ret = ftrace_startup(&trace_probe_ops, 0);
ftrace_probe_registered = 1;
}
@@ -3466,7 +3474,11 @@ device_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
/* Keep as macros so we do not need to define the commands */
-# define ftrace_startup(ops, command) do { } while (0)
+# define ftrace_startup(ops, command) \
+ ({ \
+ (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
+ 0; \
+ })
# define ftrace_shutdown(ops, command) do { } while (0)
# define ftrace_startup_sysctl() do { } while (0)
# define ftrace_shutdown_sysctl() do { } while (0)
@@ -3484,6 +3496,10 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
{
struct ftrace_ops *op;
+ if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
+ return;
+
+ trace_recursion_set(TRACE_INTERNAL_BIT);
/*
* Some of the ops may be dynamically allocated,
* they must be freed after a synchronize_sched().
@@ -3496,6 +3512,7 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
op = rcu_dereference_raw(op->next);
};
preempt_enable_notrace();
+ trace_recursion_clear(TRACE_INTERNAL_BIT);
}
static void clear_ftrace_swapper(void)
@@ -3799,7 +3816,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
ret = __register_ftrace_function(ops);
if (!ret)
- ftrace_startup(ops, 0);
+ ret = ftrace_startup(ops, 0);
out_unlock:
@@ -4045,7 +4062,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
ftrace_graph_return = retfunc;
ftrace_graph_entry = entryfunc;
- ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
+ ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
out:
mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0ef7b4b2a1f..b0c7aa40794 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2216,7 +2216,7 @@ static noinline void trace_recursive_fail(void)
printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
"HC[%lu]:SC[%lu]:NMI[%lu]\n",
- current->trace_recursion,
+ trace_recursion_buffer(),
hardirq_count() >> HARDIRQ_SHIFT,
softirq_count() >> SOFTIRQ_SHIFT,
in_nmi());
@@ -2226,9 +2226,9 @@ static noinline void trace_recursive_fail(void)
static inline int trace_recursive_lock(void)
{
- current->trace_recursion++;
+ trace_recursion_inc();
- if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+ if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
return 0;
trace_recursive_fail();
@@ -2238,9 +2238,9 @@ static inline int trace_recursive_lock(void)
static inline void trace_recursive_unlock(void)
{
- WARN_ON_ONCE(!current->trace_recursion);
+ WARN_ON_ONCE(!trace_recursion_buffer());
- current->trace_recursion--;
+ trace_recursion_dec();
}
#else
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6b69c4bd306..229f8591f61 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -784,4 +784,19 @@ extern const char *__stop___trace_bprintk_fmt[];
FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
#include "trace_entries.h"
+/* Only current can touch trace_recursion */
+#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
+#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
+
+/* Ring buffer has the 10 LSB bits to count */
+#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
+
+/* for function tracing recursion */
+#define TRACE_INTERNAL_BIT (1<<11)
+#define TRACE_GLOBAL_BIT (1<<12)
+
+#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
+#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
+#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
+
#endif /* _LINUX_KERNEL_TRACE_H */
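The new trace_recursion bits give each tracing context a per-task flag that the list functions set on entry and clear on exit, bailing out if the flag is already set, so a tracer that ends up calling back into traced code cannot recurse indefinitely. A stand-alone sketch of that guard, using a thread-local mask in place of current->trace_recursion:

#include <stdio.h>

#define TRACE_INTERNAL_BIT (1 << 11)

static _Thread_local unsigned long trace_recursion;

static void traced_call(int depth);

static void list_func(int depth)
{
	if (trace_recursion & TRACE_INTERNAL_BIT)	/* already inside: bail */
		return;
	trace_recursion |= TRACE_INTERNAL_BIT;
	printf("tracing at depth %d\n", depth);
	traced_call(depth + 1);				/* would otherwise recurse */
	trace_recursion &= ~TRACE_INTERNAL_BIT;
}

static void traced_call(int depth)
{
	list_func(depth);	/* every call re-enters the tracer */
}

int main(void)
{
	traced_call(0);		/* prints once instead of recursing forever */
	return 0;
}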
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2fe11034135..686ec399f2a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1657,7 +1657,12 @@ static struct ftrace_ops trace_ops __initdata =
static __init void event_trace_self_test_with_function(void)
{
- register_ftrace_function(&trace_ops);
+ int ret;
+ ret = register_ftrace_function(&trace_ops);
+ if (WARN_ON(ret < 0)) {
+ pr_info("Failed to enable function tracer for event tests\n");
+ return;
+ }
pr_info("Running tests again, along with the function tracer\n");
event_trace_self_tests();
unregister_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index cf535ccedc8..e37de492a9e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
}
EXPORT_SYMBOL(ftrace_print_symbols_seq);
+#if BITS_PER_LONG == 32
+const char *
+ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
+ const struct trace_print_flags_u64 *symbol_array)
+{
+ int i;
+ const char *ret = p->buffer + p->len;
+
+ for (i = 0; symbol_array[i].name; i++) {
+
+ if (val != symbol_array[i].mask)
+ continue;
+
+ trace_seq_puts(p, symbol_array[i].name);
+ break;
+ }
+
+ if (!p->len)
+ trace_seq_printf(p, "0x%llx", val);
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
+#endif
+
const char *
ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
{
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 44646179eab..bff131b9510 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
static struct uts_namespace *create_uts_ns(void)
{
@@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref)
put_user_ns(ns->user_ns);
kfree(ns);
}
+
+static void *utsns_get(struct task_struct *task)
+{
+ struct uts_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ rcu_read_lock();
+ nsproxy = task_nsproxy(task);
+ if (nsproxy) {
+ ns = nsproxy->uts_ns;
+ get_uts_ns(ns);
+ }
+ rcu_read_unlock();
+
+ return ns;
+}
+
+static void utsns_put(void *ns)
+{
+ put_uts_ns(ns);
+}
+
+static int utsns_install(struct nsproxy *nsproxy, void *ns)
+{
+ get_uts_ns(ns);
+ put_uts_ns(nsproxy->uts_ns);
+ nsproxy->uts_ns = ns;
+ return 0;
+}
+
+const struct proc_ns_operations utsns_operations = {
+ .name = "uts",
+ .type = CLONE_NEWUTS,
+ .get = utsns_get,
+ .put = utsns_put,
+ .install = utsns_install,
+};
+
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7daa4b072e9..3d0c56ad479 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -415,15 +415,13 @@ static void watchdog_nmi_disable(int cpu) { return; }
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
/* prepare/enable/disable routines */
-static int watchdog_prepare_cpu(int cpu)
+static void watchdog_prepare_cpu(int cpu)
{
struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
WARN_ON(per_cpu(softlockup_watchdog, cpu));
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = watchdog_timer_fn;
-
- return 0;
}
static int watchdog_enable(int cpu)
@@ -542,17 +540,16 @@ static int __cpuinit
cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
int hotcpu = (unsigned long)hcpu;
- int err = 0;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- err = watchdog_prepare_cpu(hotcpu);
+ watchdog_prepare_cpu(hotcpu);
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
if (watchdog_enabled)
- err = watchdog_enable(hotcpu);
+ watchdog_enable(hotcpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e3378e8d3a5..0400553f0d0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2866,9 +2866,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)
}
}
- /* just in case, make sure it's actually aligned
- * - this is affected by PERCPU() alignment in vmlinux.lds.S
- */
+ /* just in case, make sure it's actually aligned */
BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
return wq->cpu_wq.v ? 0 : -ENOMEM;
}