aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c8
-rw-r--r--kernel/cgroup.c1
-rw-r--r--kernel/cred.c293
-rw-r--r--kernel/exit.c4
-rw-r--r--kernel/fork.c53
-rw-r--r--kernel/futex.c28
-rw-r--r--kernel/futex_compat.c6
-rw-r--r--kernel/irq/manage.c27
-rw-r--r--kernel/irq/numa_migrate.c4
-rw-r--r--kernel/kmod.c5
-rw-r--r--kernel/lockdep_proc.c3
-rw-r--r--kernel/module.c10
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/perf_counter.c680
-rw-r--r--kernel/posix-cpu-timers.c7
-rw-r--r--kernel/posix-timers.c7
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rtmutex.c4
-rw-r--r--kernel/sched_cpupri.c15
-rw-r--r--kernel/sched_fair.c32
-rw-r--r--kernel/signal.c25
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/sysctl.c6
-rw-r--r--kernel/time/clockevents.c16
-rw-r--r--kernel/time/tick-broadcast.c7
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/trace/blktrace.c12
-rw-r--r--kernel/trace/ftrace.c21
-rw-r--r--kernel/trace/ring_buffer.c15
-rw-r--r--kernel/trace/trace.c25
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_event_profile.c2
-rw-r--r--kernel/trace/trace_events.c4
-rw-r--r--kernel/trace/trace_events_filter.c20
-rw-r--r--kernel/trace/trace_functions_graph.c11
-rw-r--r--kernel/trace/trace_printk.c2
-rw-r--r--kernel/wait.c5
-rw-r--r--kernel/workqueue.c7
38 files changed, 993 insertions, 383 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 9f3391090b3..9a4715a2f6b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -491,13 +491,17 @@ static void do_acct_process(struct bsd_acct_struct *acct,
u64 run_time;
struct timespec uptime;
struct tty_struct *tty;
+ const struct cred *orig_cred;
+
+ /* Perform file operations on behalf of whoever enabled accounting */
+ orig_cred = override_creds(file->f_cred);
/*
* First check to see if there is enough free_space to continue
* the process accounting system.
*/
if (!check_free_space(acct, file))
- return;
+ goto out;
/*
* Fill the accounting struct with the needed info as recorded
@@ -578,6 +582,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
sizeof(acct_t), &file->f_pos);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
+out:
+ revert_creds(orig_cred);
}
/**
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b6eadfe30e7..c7ece8f027f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -600,6 +600,7 @@ static struct inode_operations cgroup_dir_inode_operations;
static struct file_operations proc_cgroupstats_operations;
static struct backing_dev_info cgroup_backing_dev_info = {
+ .name = "cgroup",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
diff --git a/kernel/cred.c b/kernel/cred.c
index 1bb4d7e5d61..006fcab009d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -18,6 +18,18 @@
#include <linux/cn_proc.h>
#include "cred-internals.h"
+#if 0
+#define kdebug(FMT, ...) \
+ printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#else
+static inline __attribute__((format(printf, 1, 2)))
+void no_printk(const char *fmt, ...)
+{
+}
+#define kdebug(FMT, ...) \
+ no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#endif
+
static struct kmem_cache *cred_jar;
/*
@@ -36,6 +48,10 @@ static struct thread_group_cred init_tgcred = {
*/
struct cred init_cred = {
.usage = ATOMIC_INIT(4),
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ .subscribers = ATOMIC_INIT(2),
+ .magic = CRED_MAGIC,
+#endif
.securebits = SECUREBITS_DEFAULT,
.cap_inheritable = CAP_INIT_INH_SET,
.cap_permitted = CAP_FULL_SET,
@@ -48,6 +64,31 @@ struct cred init_cred = {
#endif
};
+static inline void set_cred_subscribers(struct cred *cred, int n)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ atomic_set(&cred->subscribers, n);
+#endif
+}
+
+static inline int read_cred_subscribers(const struct cred *cred)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ return atomic_read(&cred->subscribers);
+#else
+ return 0;
+#endif
+}
+
+static inline void alter_cred_subscribers(const struct cred *_cred, int n)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ struct cred *cred = (struct cred *) _cred;
+
+ atomic_add(n, &cred->subscribers);
+#endif
+}
+
/*
* Dispose of the shared task group credentials
*/
@@ -85,9 +126,22 @@ static void put_cred_rcu(struct rcu_head *rcu)
{
struct cred *cred = container_of(rcu, struct cred, rcu);
+ kdebug("put_cred_rcu(%p)", cred);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ if (cred->magic != CRED_MAGIC_DEAD ||
+ atomic_read(&cred->usage) != 0 ||
+ read_cred_subscribers(cred) != 0)
+ panic("CRED: put_cred_rcu() sees %p with"
+ " mag %x, put %p, usage %d, subscr %d\n",
+ cred, cred->magic, cred->put_addr,
+ atomic_read(&cred->usage),
+ read_cred_subscribers(cred));
+#else
if (atomic_read(&cred->usage) != 0)
panic("CRED: put_cred_rcu() sees %p with usage %d\n",
cred, atomic_read(&cred->usage));
+#endif
security_cred_free(cred);
key_put(cred->thread_keyring);
@@ -106,12 +160,90 @@ static void put_cred_rcu(struct rcu_head *rcu)
*/
void __put_cred(struct cred *cred)
{
+ kdebug("__put_cred(%p{%d,%d})", cred,
+ atomic_read(&cred->usage),
+ read_cred_subscribers(cred));
+
BUG_ON(atomic_read(&cred->usage) != 0);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ BUG_ON(read_cred_subscribers(cred) != 0);
+ cred->magic = CRED_MAGIC_DEAD;
+ cred->put_addr = __builtin_return_address(0);
+#endif
+ BUG_ON(cred == current->cred);
+ BUG_ON(cred == current->real_cred);
call_rcu(&cred->rcu, put_cred_rcu);
}
EXPORT_SYMBOL(__put_cred);
+/*
+ * Clean up a task's credentials when it exits
+ */
+void exit_creds(struct task_struct *tsk)
+{
+ struct cred *cred;
+
+ kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
+ atomic_read(&tsk->cred->usage),
+ read_cred_subscribers(tsk->cred));
+
+ cred = (struct cred *) tsk->real_cred;
+ tsk->real_cred = NULL;
+ validate_creds(cred);
+ alter_cred_subscribers(cred, -1);
+ put_cred(cred);
+
+ cred = (struct cred *) tsk->cred;
+ tsk->cred = NULL;
+ validate_creds(cred);
+ alter_cred_subscribers(cred, -1);
+ put_cred(cred);
+
+ cred = (struct cred *) tsk->replacement_session_keyring;
+ if (cred) {
+ tsk->replacement_session_keyring = NULL;
+ validate_creds(cred);
+ put_cred(cred);
+ }
+}
+
+/*
+ * Allocate blank credentials, such that the credentials can be filled in at a
+ * later date without risk of ENOMEM.
+ */
+struct cred *cred_alloc_blank(void)
+{
+ struct cred *new;
+
+ new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+#ifdef CONFIG_KEYS
+ new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
+ if (!new->tgcred) {
+ kfree(new);
+ return NULL;
+ }
+ atomic_set(&new->tgcred->usage, 1);
+#endif
+
+ atomic_set(&new->usage, 1);
+
+ if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
+ goto error;
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ new->magic = CRED_MAGIC;
+#endif
+ return new;
+
+error:
+ abort_creds(new);
+ return NULL;
+}
+
/**
* prepare_creds - Prepare a new set of credentials for modification
*
@@ -132,16 +264,19 @@ struct cred *prepare_creds(void)
const struct cred *old;
struct cred *new;
- BUG_ON(atomic_read(&task->real_cred->usage) < 1);
+ validate_process_creds();
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
+ kdebug("prepare_creds() alloc %p", new);
+
old = task->cred;
memcpy(new, old, sizeof(struct cred));
atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
@@ -157,6 +292,7 @@ struct cred *prepare_creds(void)
if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
goto error;
+ validate_creds(new);
return new;
error:
@@ -229,9 +365,12 @@ struct cred *prepare_usermodehelper_creds(void)
if (!new)
return NULL;
+ kdebug("prepare_usermodehelper_creds() alloc %p", new);
+
memcpy(new, &init_cred, sizeof(struct cred));
atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
@@ -250,6 +389,7 @@ struct cred *prepare_usermodehelper_creds(void)
#endif
if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
goto error;
+ validate_creds(new);
BUG_ON(atomic_read(&new->usage) != 1);
return new;
@@ -286,6 +426,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
) {
p->real_cred = get_cred(p->cred);
get_cred(p->cred);
+ alter_cred_subscribers(p->cred, 2);
+ kdebug("share_creds(%p{%d,%d})",
+ p->cred, atomic_read(&p->cred->usage),
+ read_cred_subscribers(p->cred));
atomic_inc(&p->cred->user->processes);
return 0;
}
@@ -331,6 +475,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
atomic_inc(&new->user->processes);
p->cred = p->real_cred = get_cred(new);
+ alter_cred_subscribers(new, 2);
+ validate_creds(new);
return 0;
error_put:
@@ -355,13 +501,20 @@ error_put:
int commit_creds(struct cred *new)
{
struct task_struct *task = current;
- const struct cred *old;
+ const struct cred *old = task->real_cred;
- BUG_ON(task->cred != task->real_cred);
- BUG_ON(atomic_read(&task->real_cred->usage) < 2);
+ kdebug("commit_creds(%p{%d,%d})", new,
+ atomic_read(&new->usage),
+ read_cred_subscribers(new));
+
+ BUG_ON(task->cred != old);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ BUG_ON(read_cred_subscribers(old) < 2);
+ validate_creds(old);
+ validate_creds(new);
+#endif
BUG_ON(atomic_read(&new->usage) < 1);
- old = task->real_cred;
security_commit_creds(new, old);
get_cred(new); /* we will require a ref for the subj creds too */
@@ -390,12 +543,14 @@ int commit_creds(struct cred *new)
* cheaply with the new uid cache, so if it matters
* we should be checking for it. -DaveM
*/
+ alter_cred_subscribers(new, 2);
if (new->user != old->user)
atomic_inc(&new->user->processes);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user)
atomic_dec(&old->user->processes);
+ alter_cred_subscribers(old, -2);
sched_switch_user(task);
@@ -428,6 +583,13 @@ EXPORT_SYMBOL(commit_creds);
*/
void abort_creds(struct cred *new)
{
+ kdebug("abort_creds(%p{%d,%d})", new,
+ atomic_read(&new->usage),
+ read_cred_subscribers(new));
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ BUG_ON(read_cred_subscribers(new) != 0);
+#endif
BUG_ON(atomic_read(&new->usage) < 1);
put_cred(new);
}
@@ -444,7 +606,20 @@ const struct cred *override_creds(const struct cred *new)
{
const struct cred *old = current->cred;
- rcu_assign_pointer(current->cred, get_cred(new));
+ kdebug("override_creds(%p{%d,%d})", new,
+ atomic_read(&new->usage),
+ read_cred_subscribers(new));
+
+ validate_creds(old);
+ validate_creds(new);
+ get_cred(new);
+ alter_cred_subscribers(new, 1);
+ rcu_assign_pointer(current->cred, new);
+ alter_cred_subscribers(old, -1);
+
+ kdebug("override_creds() = %p{%d,%d}", old,
+ atomic_read(&old->usage),
+ read_cred_subscribers(old));
return old;
}
EXPORT_SYMBOL(override_creds);
@@ -460,7 +635,15 @@ void revert_creds(const struct cred *old)
{
const struct cred *override = current->cred;
+ kdebug("revert_creds(%p{%d,%d})", old,
+ atomic_read(&old->usage),
+ read_cred_subscribers(old));
+
+ validate_creds(old);
+ validate_creds(override);
+ alter_cred_subscribers(old, 1);
rcu_assign_pointer(current->cred, old);
+ alter_cred_subscribers(override, -1);
put_cred(override);
}
EXPORT_SYMBOL(revert_creds);
@@ -502,11 +685,15 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (!new)
return NULL;
+ kdebug("prepare_kernel_cred() alloc %p", new);
+
if (daemon)
old = get_task_cred(daemon);
else
old = get_cred(&init_cred);
+ validate_creds(old);
+
*new = *old;
get_uid(new->user);
get_group_info(new->group_info);
@@ -526,7 +713,9 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
goto error;
atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
put_cred(old);
+ validate_creds(new);
return new;
error:
@@ -589,3 +778,95 @@ int set_create_files_as(struct cred *new, struct inode *inode)
return security_kernel_create_files_as(new, inode);
}
EXPORT_SYMBOL(set_create_files_as);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+
+/*
+ * dump invalid credentials
+ */
+static void dump_invalid_creds(const struct cred *cred, const char *label,
+ const struct task_struct *tsk)
+{
+ printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
+ label, cred,
+ cred == &init_cred ? "[init]" : "",
+ cred == tsk->real_cred ? "[real]" : "",
+ cred == tsk->cred ? "[eff]" : "");
+ printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
+ cred->magic, cred->put_addr);
+ printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
+ atomic_read(&cred->usage),
+ read_cred_subscribers(cred));
+ printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
+ cred->uid, cred->euid, cred->suid, cred->fsuid);
+ printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
+ cred->gid, cred->egid, cred->sgid, cred->fsgid);
+#ifdef CONFIG_SECURITY
+ printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
+ if ((unsigned long) cred->security >= PAGE_SIZE &&
+ (((unsigned long) cred->security & 0xffffff00) !=
+ (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
+ printk(KERN_ERR "CRED: ->security {%x, %x}\n",
+ ((u32*)cred->security)[0],
+ ((u32*)cred->security)[1]);
+#endif
+}
+
+/*
+ * report use of invalid credentials
+ */
+void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
+{
+ printk(KERN_ERR "CRED: Invalid credentials\n");
+ printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+ dump_invalid_creds(cred, "Specified", current);
+ BUG();
+}
+EXPORT_SYMBOL(__invalid_creds);
+
+/*
+ * check the credentials on a process
+ */
+void __validate_process_creds(struct task_struct *tsk,
+ const char *file, unsigned line)
+{
+ if (tsk->cred == tsk->real_cred) {
+ if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
+ creds_are_invalid(tsk->cred)))
+ goto invalid_creds;
+ } else {
+ if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
+ read_cred_subscribers(tsk->cred) < 1 ||
+ creds_are_invalid(tsk->real_cred) ||
+ creds_are_invalid(tsk->cred)))
+ goto invalid_creds;
+ }
+ return;
+
+invalid_creds:
+ printk(KERN_ERR "CRED: Invalid process credentials\n");
+ printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+
+ dump_invalid_creds(tsk->real_cred, "Real", tsk);
+ if (tsk->cred != tsk->real_cred)
+ dump_invalid_creds(tsk->cred, "Effective", tsk);
+ else
+ printk(KERN_ERR "CRED: Effective creds == Real creds\n");
+ BUG();
+}
+EXPORT_SYMBOL(__validate_process_creds);
+
+/*
+ * check creds for do_exit()
+ */
+void validate_creds_for_do_exit(struct task_struct *tsk)
+{
+ kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
+ tsk->real_cred, tsk->cred,
+ atomic_read(&tsk->cred->usage),
+ read_cred_subscribers(tsk->cred));
+
+ __validate_process_creds(tsk, __FILE__, __LINE__);
+}
+
+#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/exit.c b/kernel/exit.c
index 869dc221733..c98ff7a8025 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -901,6 +901,8 @@ NORET_TYPE void do_exit(long code)
tracehook_report_exit(&code);
+ validate_creds_for_do_exit(tsk);
+
/*
* We're taking recursive faults here in do_exit. Safest is to just
* leave this task alone and wait for reboot.
@@ -1009,6 +1011,8 @@ NORET_TYPE void do_exit(long code)
if (tsk->splice_pipe)
__free_pipe_info(tsk->splice_pipe);
+ validate_creds_for_do_exit(tsk);
+
preempt_disable();
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
diff --git a/kernel/fork.c b/kernel/fork.c
index 29b532e718f..aab8579c609 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -152,8 +152,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
- put_cred(tsk->real_cred);
- put_cred(tsk->cred);
+ exit_creds(tsk);
delayacct_tsk_free(tsk);
if (!profile_handoff_task(tsk))
@@ -426,7 +425,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
- mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0;
mm->core_state = NULL;
mm->nr_ptes = 0;
set_mm_counter(mm, file_rss, 0);
@@ -568,18 +566,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
* the value intact in a core dump, and to save the unnecessary
* trouble otherwise. Userland only wants this done for a sys_exit.
*/
- if (tsk->clear_child_tid
- && !(tsk->flags & PF_SIGNALED)
- && atomic_read(&mm->mm_users) > 1) {
- u32 __user * tidptr = tsk->clear_child_tid;
+ if (tsk->clear_child_tid) {
+ if (!(tsk->flags & PF_SIGNALED) &&
+ atomic_read(&mm->mm_users) > 1) {
+ /*
+ * We don't check the error code - if userspace has
+ * not set up a proper pointer then tough luck.
+ */
+ put_user(0, tsk->clear_child_tid);
+ sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
+ 1, NULL, NULL, 0);
+ }
tsk->clear_child_tid = NULL;
-
- /*
- * We don't check the error code - if userspace has
- * not set up a proper pointer then tough luck.
- */
- put_user(0, tidptr);
- sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
}
}
@@ -816,11 +814,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
struct signal_struct *sig;
- if (clone_flags & CLONE_THREAD) {
- atomic_inc(&current->signal->count);
- atomic_inc(&current->signal->live);
+ if (clone_flags & CLONE_THREAD)
return 0;
- }
sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
tsk->signal = sig;
@@ -878,16 +873,6 @@ void __cleanup_signal(struct signal_struct *sig)
kmem_cache_free(signal_cachep, sig);
}
-static void cleanup_signal(struct task_struct *tsk)
-{
- struct signal_struct *sig = tsk->signal;
-
- atomic_dec(&sig->live);
-
- if (atomic_dec_and_test(&sig->count))
- __cleanup_signal(sig);
-}
-
static void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
unsigned long new_flags = p->flags;
@@ -1240,6 +1225,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}
if (clone_flags & CLONE_THREAD) {
+ atomic_inc(&current->signal->count);
+ atomic_inc(&current->signal->live);
p->group_leader = current->group_leader;
list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
}
@@ -1269,6 +1256,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ perf_counter_fork(p);
return p;
bad_fork_free_pid:
@@ -1282,7 +1270,8 @@ bad_fork_cleanup_mm:
if (p->mm)
mmput(p->mm);
bad_fork_cleanup_signal:
- cleanup_signal(p);
+ if (!(clone_flags & CLONE_THREAD))
+ __cleanup_signal(p->signal);
bad_fork_cleanup_sighand:
__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
@@ -1307,8 +1296,7 @@ bad_fork_cleanup_put_domain:
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
- put_cred(p->real_cred);
- put_cred(p->cred);
+ exit_creds(p);
bad_fork_free:
free_task(p);
fork_out:
@@ -1410,9 +1398,6 @@ long do_fork(unsigned long clone_flags,
init_completion(&vfork);
}
- if (!(clone_flags & CLONE_THREAD))
- perf_counter_fork(p);
-
audit_finish_fork(p);
tracehook_report_clone(regs, clone_flags, nr, p);
diff --git a/kernel/futex.c b/kernel/futex.c
index 0672ff88f15..e18cfbdc719 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1010,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
* requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
* q: the futex_q
* key: the key of the requeue target futex
+ * hb: the hash_bucket of the requeue target futex
*
* During futex_requeue, with requeue_pi=1, it is possible to acquire the
* target futex if it is uncontended or via a lock steal. Set the futex_q key
* to the requeue target futex so the waiter can detect the wakeup on the right
* futex, but remove it from the hb and NULL the rt_waiter so it can detect
- * atomic lock acquisition. Must be called with the q->lock_ptr held.
+ * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
+ * to protect access to the pi_state to fixup the owner later. Must be called
+ * with both q->lock_ptr and hb->lock held.
*/
static inline
-void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
+ struct futex_hash_bucket *hb)
{
drop_futex_key_refs(&q->key);
get_futex_key_refs(key);
@@ -1030,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL;
+ q->lock_ptr = &hb->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+ q->list.plist.lock = &hb->lock;
+#endif
+
wake_up_state(q->task, TASK_NORMAL);
}
@@ -1088,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
set_waiters);
if (ret == 1)
- requeue_pi_wake_futex(top_waiter, key2);
+ requeue_pi_wake_futex(top_waiter, key2, hb2);
return ret;
}
@@ -1247,8 +1256,15 @@ retry_private:
if (!match_futex(&this->key, &key1))
continue;
- WARN_ON(!requeue_pi && this->rt_waiter);
- WARN_ON(requeue_pi && !this->rt_waiter);
+ /*
+ * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * be paired with each other and no other futex ops.
+ */
+ if ((requeue_pi && !this->rt_waiter) ||
+ (!requeue_pi && this->rt_waiter)) {
+ ret = -EINVAL;
+ break;
+ }
/*
* Wake nr_wake waiters. For requeue_pi, if we acquired the
@@ -1273,7 +1289,7 @@ retry_private:
this->task, 1);
if (ret == 1) {
/* We got the lock. */
- requeue_pi_wake_futex(this, &key2);
+ requeue_pi_wake_futex(this, &key2, hb2);
continue;
} else if (ret) {
/* -EDEADLK */
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d607a5b9ee2..235716556bf 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
int cmd = op & FUTEX_CMD_MASK;
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET)) {
+ cmd == FUTEX_WAIT_BITSET ||
+ cmd == FUTEX_WAIT_REQUEUE_PI)) {
if (get_compat_timespec(&ts, utime))
return -EFAULT;
if (!timespec_valid(&ts))
@@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
t = ktime_add_safe(ktime_get(), t);
tp = &t;
}
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE)
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
val2 = (int) (unsigned long) utime;
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 61c679db468..0ec9ed83173 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -607,7 +607,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
get_task_struct(t);
new->thread = t;
- wake_up_process(t);
}
/*
@@ -690,6 +689,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
(int)(new->flags & IRQF_TRIGGER_MASK));
}
+ new->irq = irq;
*old_ptr = new;
/* Reset broken irq detection when installing new handler */
@@ -707,7 +707,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
spin_unlock_irqrestore(&desc->lock, flags);
- new->irq = irq;
+ /*
+ * Strictly no need to wake it up, but hung_task complains
+ * when no hard interrupt wakes the thread up.
+ */
+ if (new->thread)
+ wake_up_process(new->thread);
+
register_irq_proc(irq, desc);
new->dir = NULL;
register_handler_proc(irq, new);
@@ -761,7 +767,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action, **action_ptr;
- struct task_struct *irqthread;
unsigned long flags;
WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -809,9 +814,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
desc->chip->disable(irq);
}
- irqthread = action->thread;
- action->thread = NULL;
-
spin_unlock_irqrestore(&desc->lock, flags);
unregister_handler_proc(irq, action);
@@ -819,12 +821,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
/* Make sure it's not being used on another CPU: */
synchronize_irq(irq);
- if (irqthread) {
- if (!test_bit(IRQTF_DIED, &action->thread_flags))
- kthread_stop(irqthread);
- put_task_struct(irqthread);
- }
-
#ifdef CONFIG_DEBUG_SHIRQ
/*
* It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -840,6 +836,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
local_irq_restore(flags);
}
#endif
+
+ if (action->thread) {
+ if (!test_bit(IRQTF_DIED, &action->thread_flags))
+ kthread_stop(action->thread);
+ put_task_struct(action->thread);
+ }
+
return action;
}
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 2f69bee57bf..3fd30197da2 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -107,8 +107,8 @@ out_unlock:
struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
{
- /* those all static, do move them */
- if (desc->irq < NR_IRQS_LEGACY)
+ /* those static or target node is -1, do not move them */
+ if (desc->irq < NR_IRQS_LEGACY || node == -1)
return desc;
if (desc->node != node)
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 385c31a1bdb..4e8cae2e914 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -78,6 +78,10 @@ int __request_module(bool wait, const char *fmt, ...)
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
+ ret = security_kernel_module_request();
+ if (ret)
+ return ret;
+
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
va_end(args);
@@ -462,6 +466,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
int retval = 0;
BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+ validate_creds(sub_info->cred);
helper_lock();
if (sub_info->path[0] == '\0')
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d7135aa2d2c..e94caa666db 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void)
&proc_lockdep_stats_operations);
#ifdef CONFIG_LOCK_STAT
- proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations);
+ proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
+ &proc_lock_stat_operations);
#endif
return 0;
diff --git a/kernel/module.c b/kernel/module.c
index fd141140355..2d537186191 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -909,16 +909,18 @@ void __symbol_put(const char *symbol)
}
EXPORT_SYMBOL(__symbol_put);
+/* Note this assumes addr is a function, which it currently always is. */
void symbol_put_addr(void *addr)
{
struct module *modaddr;
+ unsigned long a = (unsigned long)dereference_function_descriptor(addr);
- if (core_kernel_text((unsigned long)addr))
+ if (core_kernel_text(a))
return;
/* module_text_address is safe here: we're supposed to have reference
* to module from symbol_get, so it can't go away. */
- modaddr = __module_text_address((unsigned long)addr);
+ modaddr = __module_text_address(a);
BUG_ON(!modaddr);
module_put(modaddr);
}
@@ -1272,6 +1274,10 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
struct module_notes_attrs *notes_attrs;
struct bin_attribute *nattr;
+ /* failed to create section attributes, so can't create notes */
+ if (!mod->sect_attrs)
+ return;
+
/* Count notes sections and allocate structures. */
notes = 0;
for (i = 0; i < nsect; i++)
diff --git a/kernel/panic.c b/kernel/panic.c
index 984b3ecbd72..512ab73b0ca 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -301,6 +301,7 @@ int oops_may_print(void)
*/
void oops_enter(void)
{
+ tracing_off();
/* can't trust the integrity of the kernel anymore: */
debug_locks_off();
do_oops_enter_exit();
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 95093104195..d7cbc579fc8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1;
static atomic_t nr_counters __read_mostly;
static atomic_t nr_mmap_counters __read_mostly;
static atomic_t nr_comm_counters __read_mostly;
+static atomic_t nr_task_counters __read_mostly;
/*
* perf counter paranoia level:
@@ -49,7 +50,7 @@ static atomic_t nr_comm_counters __read_mostly;
* 1 - disallow cpu counters to unpriv
* 2 - disallow kernel profiling to unpriv
*/
-int sysctl_perf_counter_paranoid __read_mostly;
+int sysctl_perf_counter_paranoid __read_mostly = 1;
static inline bool perf_paranoid_cpu(void)
{
@@ -87,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); }
void __weak hw_perf_enable(void) { barrier(); }
void __weak hw_perf_counter_setup(int cpu) { barrier(); }
+void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
int __weak
hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -305,6 +307,10 @@ counter_sched_out(struct perf_counter *counter,
return;
counter->state = PERF_COUNTER_STATE_INACTIVE;
+ if (counter->pending_disable) {
+ counter->pending_disable = 0;
+ counter->state = PERF_COUNTER_STATE_OFF;
+ }
counter->tstamp_stopped = ctx->time;
counter->pmu->disable(counter);
counter->oncpu = -1;
@@ -1103,7 +1109,7 @@ static void perf_counter_sync_stat(struct perf_counter_context *ctx,
__perf_counter_sync_stat(counter, next_counter);
counter = list_next_entry(counter, event_entry);
- next_counter = list_next_entry(counter, event_entry);
+ next_counter = list_next_entry(next_counter, event_entry);
}
}
@@ -1497,10 +1503,21 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
*/
static void __perf_counter_read(void *info)
{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
unsigned long flags;
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived. In that case
+ * counter->count would have been updated to a recent sample
+ * when the counter was scheduled out.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
local_irq_save(flags);
if (ctx->is_active)
update_context_time(ctx);
@@ -1654,6 +1671,8 @@ static void free_counter(struct perf_counter *counter)
atomic_dec(&nr_mmap_counters);
if (counter->attr.comm)
atomic_dec(&nr_comm_counters);
+ if (counter->attr.task)
+ atomic_dec(&nr_task_counters);
}
if (counter->destroy)
@@ -1688,14 +1707,133 @@ static int perf_release(struct inode *inode, struct file *file)
return 0;
}
+static int perf_counter_read_size(struct perf_counter *counter)
+{
+ int entry = sizeof(u64); /* value */
+ int size = 0;
+ int nr = 1;
+
+ if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ size += sizeof(u64);
+
+ if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ size += sizeof(u64);
+
+ if (counter->attr.read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (counter->attr.read_format & PERF_FORMAT_GROUP) {
+ nr += counter->group_leader->nr_siblings;
+ size += sizeof(u64);
+ }
+
+ size += entry * nr;
+
+ return size;
+}
+
+static u64 perf_counter_read_value(struct perf_counter *counter)
+{
+ struct perf_counter *child;
+ u64 total = 0;
+
+ total += perf_counter_read(counter);
+ list_for_each_entry(child, &counter->child_list, child_list)
+ total += perf_counter_read(child);
+
+ return total;
+}
+
+static int perf_counter_read_entry(struct perf_counter *counter,
+ u64 read_format, char __user *buf)
+{
+ int n = 0, count = 0;
+ u64 values[2];
+
+ values[n++] = perf_counter_read_value(counter);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(counter);
+
+ count = n * sizeof(u64);
+
+ if (copy_to_user(buf, values, count))
+ return -EFAULT;
+
+ return count;
+}
+
+static int perf_counter_read_group(struct perf_counter *counter,
+ u64 read_format, char __user *buf)
+{
+ struct perf_counter *leader = counter->group_leader, *sub;
+ int n = 0, size = 0, err = -EFAULT;
+ u64 values[3];
+
+ values[n++] = 1 + leader->nr_siblings;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = leader->total_time_enabled +
+ atomic64_read(&leader->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = leader->total_time_running +
+ atomic64_read(&leader->child_total_time_running);
+ }
+
+ size = n * sizeof(u64);
+
+ if (copy_to_user(buf, values, size))
+ return -EFAULT;
+
+ err = perf_counter_read_entry(leader, read_format, buf + size);
+ if (err < 0)
+ return err;
+
+ size += err;
+
+ list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+ err = perf_counter_read_entry(sub, read_format,
+ buf + size);
+ if (err < 0)
+ return err;
+
+ size += err;
+ }
+
+ return size;
+}
+
+static int perf_counter_read_one(struct perf_counter *counter,
+ u64 read_format, char __user *buf)
+{
+ u64 values[4];
+ int n = 0;
+
+ values[n++] = perf_counter_read_value(counter);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = counter->total_time_enabled +
+ atomic64_read(&counter->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = counter->total_time_running +
+ atomic64_read(&counter->child_total_time_running);
+ }
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(counter);
+
+ if (copy_to_user(buf, values, n * sizeof(u64)))
+ return -EFAULT;
+
+ return n * sizeof(u64);
+}
+
/*
* Read the performance counter - simple non blocking version for now
*/
static ssize_t
perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
{
- u64 values[4];
- int n;
+ u64 read_format = counter->attr.read_format;
+ int ret;
/*
* Return end-of-file for a read on a counter that is in
@@ -1705,28 +1843,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
if (counter->state == PERF_COUNTER_STATE_ERROR)
return 0;
+ if (count < perf_counter_read_size(counter))
+ return -ENOSPC;
+
WARN_ON_ONCE(counter->ctx->parent_ctx);
mutex_lock(&counter->child_mutex);
- values[0] = perf_counter_read(counter);
- n = 1;
- if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
- values[n++] = counter->total_time_enabled +
- atomic64_read(&counter->child_total_time_enabled);
- if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
- values[n++] = counter->total_time_running +
- atomic64_read(&counter->child_total_time_running);
- if (counter->attr.read_format & PERF_FORMAT_ID)
- values[n++] = primary_counter_id(counter);
+ if (read_format & PERF_FORMAT_GROUP)
+ ret = perf_counter_read_group(counter, read_format, buf);
+ else
+ ret = perf_counter_read_one(counter, read_format, buf);
mutex_unlock(&counter->child_mutex);
- if (count < n * sizeof(u64))
- return -EINVAL;
- count = n * sizeof(u64);
-
- if (copy_to_user(buf, values, count))
- return -EFAULT;
-
- return count;
+ return ret;
}
static ssize_t
@@ -1891,6 +2019,10 @@ int perf_counter_task_disable(void)
return 0;
}
+#ifndef PERF_COUNTER_INDEX_OFFSET
+# define PERF_COUNTER_INDEX_OFFSET 0
+#endif
+
static int perf_counter_index(struct perf_counter *counter)
{
if (counter->state != PERF_COUNTER_STATE_ACTIVE)
@@ -2230,7 +2362,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry)
if (counter->pending_disable) {
counter->pending_disable = 0;
- perf_counter_disable(counter);
+ __perf_counter_disable(counter);
}
if (counter->pending_wakeup) {
@@ -2615,7 +2747,80 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
return task_pid_nr_ns(p, counter->ns);
}
-static void perf_counter_output(struct perf_counter *counter, int nmi,
+static void perf_output_read_one(struct perf_output_handle *handle,
+ struct perf_counter *counter)
+{
+ u64 read_format = counter->attr.read_format;
+ u64 values[4];
+ int n = 0;
+
+ values[n++] = atomic64_read(&counter->count);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = counter->total_time_enabled +
+ atomic64_read(&counter->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = counter->total_time_running +
+ atomic64_read(&counter->child_total_time_running);
+ }
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(counter);
+
+ perf_output_copy(handle, values, n * sizeof(u64));
+}
+
+/*
+ * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
+ */
+static void perf_output_read_group(struct perf_output_handle *handle,
+ struct perf_counter *counter)
+{
+ struct perf_counter *leader = counter->group_leader, *sub;
+ u64 read_format = counter->attr.read_format;
+ u64 values[5];
+ int n = 0;
+
+ values[n++] = 1 + leader->nr_siblings;
+
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = leader->total_time_enabled;
+
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = leader->total_time_running;
+
+ if (leader != counter)
+ leader->pmu->read(leader);
+
+ values[n++] = atomic64_read(&leader->count);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(leader);
+
+ perf_output_copy(handle, values, n * sizeof(u64));
+
+ list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+ n = 0;
+
+ if (sub != counter)
+ sub->pmu->read(sub);
+
+ values[n++] = atomic64_read(&sub->count);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(sub);
+
+ perf_output_copy(handle, values, n * sizeof(u64));
+ }
+}
+
+static void perf_output_read(struct perf_output_handle *handle,
+ struct perf_counter *counter)
+{
+ if (counter->attr.read_format & PERF_FORMAT_GROUP)
+ perf_output_read_group(handle, counter);
+ else
+ perf_output_read_one(handle, counter);
+}
+
+void perf_counter_output(struct perf_counter *counter, int nmi,
struct perf_sample_data *data)
{
int ret;
@@ -2626,10 +2831,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
struct {
u32 pid, tid;
} tid_entry;
- struct {
- u64 id;
- u64 counter;
- } group_entry;
struct perf_callchain_entry *callchain = NULL;
int callchain_size = 0;
u64 time;
@@ -2684,10 +2885,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
if (sample_type & PERF_SAMPLE_PERIOD)
header.size += sizeof(u64);
- if (sample_type & PERF_SAMPLE_GROUP) {
- header.size += sizeof(u64) +
- counter->nr_siblings * sizeof(group_entry);
- }
+ if (sample_type & PERF_SAMPLE_READ)
+ header.size += perf_counter_read_size(counter);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
callchain = perf_callchain(data->regs);
@@ -2699,6 +2898,18 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
header.size += sizeof(u64);
}
+ if (sample_type & PERF_SAMPLE_RAW) {
+ int size = sizeof(u32);
+
+ if (data->raw)
+ size += data->raw->size;
+ else
+ size += sizeof(u32);
+
+ WARN_ON_ONCE(size & (sizeof(u64)-1));
+ header.size += size;
+ }
+
ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
if (ret)
return;
@@ -2732,26 +2943,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
if (sample_type & PERF_SAMPLE_PERIOD)
perf_output_put(&handle, data->period);
- /*
- * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
- */
- if (sample_type & PERF_SAMPLE_GROUP) {
- struct perf_counter *leader, *sub;
- u64 nr = counter->nr_siblings;
-
- perf_output_put(&handle, nr);
-
- leader = counter->group_leader;
- list_for_each_entry(sub, &leader->sibling_list, list_entry) {
- if (sub != counter)
- sub->pmu->read(sub);
-
- group_entry.id = primary_counter_id(sub);
- group_entry.counter = atomic64_read(&sub->count);
-
- perf_output_put(&handle, group_entry);
- }
- }
+ if (sample_type & PERF_SAMPLE_READ)
+ perf_output_read(&handle, counter);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
if (callchain)
@@ -2762,6 +2955,22 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
}
}
+ if (sample_type & PERF_SAMPLE_RAW) {
+ if (data->raw) {
+ perf_output_put(&handle, data->raw->size);
+ perf_output_copy(&handle, data->raw->data, data->raw->size);
+ } else {
+ struct {
+ u32 size;
+ u32 data;
+ } raw = {
+ .size = sizeof(u32),
+ .data = 0,
+ };
+ perf_output_put(&handle, raw);
+ }
+ }
+
perf_output_end(&handle);
}
@@ -2774,8 +2983,6 @@ struct perf_read_event {
u32 pid;
u32 tid;
- u64 value;
- u64 format[3];
};
static void
@@ -2787,80 +2994,74 @@ perf_counter_read_event(struct perf_counter *counter,
.header = {
.type = PERF_EVENT_READ,
.misc = 0,
- .size = sizeof(event) - sizeof(event.format),
+ .size = sizeof(event) + perf_counter_read_size(counter),
},
.pid = perf_counter_pid(counter, task),
.tid = perf_counter_tid(counter, task),
- .value = atomic64_read(&counter->count),
};
- int ret, i = 0;
-
- if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
- event.header.size += sizeof(u64);
- event.format[i++] = counter->total_time_enabled;
- }
-
- if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
- event.header.size += sizeof(u64);
- event.format[i++] = counter->total_time_running;
- }
-
- if (counter->attr.read_format & PERF_FORMAT_ID) {
- event.header.size += sizeof(u64);
- event.format[i++] = primary_counter_id(counter);
- }
+ int ret;
ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
if (ret)
return;
- perf_output_copy(&handle, &event, event.header.size);
+ perf_output_put(&handle, event);
+ perf_output_read(&handle, counter);
+
perf_output_end(&handle);
}
/*
- * fork tracking
+ * task tracking -- fork/exit
+ *
+ * enabled by: attr.comm | attr.mmap | attr.task
*/
-struct perf_fork_event {
- struct task_struct *task;
+struct perf_task_event {
+ struct task_struct *task;
+ struct perf_counter_context *task_ctx;
struct {
struct perf_event_header header;
u32 pid;
u32 ppid;
+ u32 tid;
+ u32 ptid;
} event;
};
-static void perf_counter_fork_output(struct perf_counter *counter,
- struct perf_fork_event *fork_event)
+static void perf_counter_task_output(struct perf_counter *counter,
+ struct perf_task_event *task_event)
{
struct perf_output_handle handle;
- int size = fork_event->event.header.size;
- struct task_struct *task = fork_event->task;
+ int size = task_event->event.header.size;
+ struct task_struct *task = task_event->task;
int ret = perf_output_begin(&handle, counter, size, 0, 0);
if (ret)
return;
- fork_event->event.pid = perf_counter_pid(counter, task);
- fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+ task_event->event.pid = perf_counter_pid(counter, task);
+ task_event->event.ppid = perf_counter_pid(counter, current);
- perf_output_put(&handle, fork_event->event);
+ task_event->event.tid = perf_counter_tid(counter, task);
+ task_event->event.ptid = perf_counter_tid(counter, current);
+
+ perf_output_put(&handle, task_event->event);
perf_output_end(&handle);
}
-static int perf_counter_fork_match(struct perf_counter *counter)
+static int perf_counter_task_match(struct perf_counter *counter)
{
- if (counter->attr.comm || counter->attr.mmap)
+ if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
return 1;
return 0;
}
-static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
- struct perf_fork_event *fork_event)
+static void perf_counter_task_ctx(struct perf_counter_context *ctx,
+ struct perf_task_event *task_event)
{
struct perf_counter *counter;
@@ -2869,54 +3070,62 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
rcu_read_lock();
list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
- if (perf_counter_fork_match(counter))
- perf_counter_fork_output(counter, fork_event);
+ if (perf_counter_task_match(counter))
+ perf_counter_task_output(counter, task_event);
}
rcu_read_unlock();
}
-static void perf_counter_fork_event(struct perf_fork_event *fork_event)
+static void perf_counter_task_event(struct perf_task_event *task_event)
{
struct perf_cpu_context *cpuctx;
- struct perf_counter_context *ctx;
+ struct perf_counter_context *ctx = task_event->task_ctx;
cpuctx = &get_cpu_var(perf_cpu_context);
- perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
+ perf_counter_task_ctx(&cpuctx->ctx, task_event);
put_cpu_var(perf_cpu_context);
rcu_read_lock();
- /*
- * doesn't really matter which of the child contexts the
- * events ends up in.
- */
- ctx = rcu_dereference(current->perf_counter_ctxp);
+ if (!ctx)
+ ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
if (ctx)
- perf_counter_fork_ctx(ctx, fork_event);
+ perf_counter_task_ctx(ctx, task_event);
rcu_read_unlock();
}
-void perf_counter_fork(struct task_struct *task)
+static void perf_counter_task(struct task_struct *task,
+ struct perf_counter_context *task_ctx,
+ int new)
{
- struct perf_fork_event fork_event;
+ struct perf_task_event task_event;
if (!atomic_read(&nr_comm_counters) &&
- !atomic_read(&nr_mmap_counters))
+ !atomic_read(&nr_mmap_counters) &&
+ !atomic_read(&nr_task_counters))
return;
- fork_event = (struct perf_fork_event){
- .task = task,
- .event = {
+ task_event = (struct perf_task_event){
+ .task = task,
+ .task_ctx = task_ctx,
+ .event = {
.header = {
- .type = PERF_EVENT_FORK,
+ .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
.misc = 0,
- .size = sizeof(fork_event.event),
+ .size = sizeof(task_event.event),
},
/* .pid */
/* .ppid */
+ /* .tid */
+ /* .ptid */
},
};
- perf_counter_fork_event(&fork_event);
+ perf_counter_task_event(&task_event);
+}
+
+void perf_counter_fork(struct task_struct *task)
+{
+ perf_counter_task(task, NULL, 1);
}
/*
@@ -3305,125 +3514,111 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
* Generic software counter infrastructure
*/
-static void perf_swcounter_update(struct perf_counter *counter)
+/*
+ * We directly increment counter->count and keep a second value in
+ * counter->hw.period_left to count intervals. This period counter
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+static u64 perf_swcounter_set_period(struct perf_counter *counter)
{
struct hw_perf_counter *hwc = &counter->hw;
- u64 prev, now;
- s64 delta;
+ u64 period = hwc->last_period;
+ u64 nr, offset;
+ s64 old, val;
+
+ hwc->last_period = hwc->sample_period;
again:
- prev = atomic64_read(&hwc->prev_count);
- now = atomic64_read(&hwc->count);
- if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
- goto again;
+ old = val = atomic64_read(&hwc->period_left);
+ if (val < 0)
+ return 0;
- delta = now - prev;
+ nr = div64_u64(period + val, period);
+ offset = nr * period;
+ val -= offset;
+ if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+ goto again;
- atomic64_add(delta, &counter->count);
- atomic64_sub(delta, &hwc->period_left);
+ return nr;
}
-static void perf_swcounter_set_period(struct perf_counter *counter)
+static void perf_swcounter_overflow(struct perf_counter *counter,
+ int nmi, struct perf_sample_data *data)
{
struct hw_perf_counter *hwc = &counter->hw;
- s64 left = atomic64_read(&hwc->period_left);
- s64 period = hwc->sample_period;
+ u64 overflow;
- if (unlikely(left <= -period)) {
- left = period;
- atomic64_set(&hwc->period_left, left);
- hwc->last_period = period;
- }
+ data->period = counter->hw.last_period;
+ overflow = perf_swcounter_set_period(counter);
- if (unlikely(left <= 0)) {
- left += period;
- atomic64_add(period, &hwc->period_left);
- hwc->last_period = period;
- }
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ return;
- atomic64_set(&hwc->prev_count, -left);
- atomic64_set(&hwc->count, -left);
+ for (; overflow; overflow--) {
+ if (perf_counter_overflow(counter, nmi, data)) {
+ /*
+ * We inhibit the overflow from happening when
+ * hwc->interrupts == MAX_INTERRUPTS.
+ */
+ break;
+ }
+ }
}
-static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+static void perf_swcounter_unthrottle(struct perf_counter *counter)
{
- enum hrtimer_restart ret = HRTIMER_RESTART;
- struct perf_sample_data data;
- struct perf_counter *counter;
- u64 period;
-
- counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
- counter->pmu->read(counter);
-
- data.addr = 0;
- data.regs = get_irq_regs();
/*
- * In case we exclude kernel IPs or are somehow not in interrupt
- * context, provide the next best thing, the user IP.
+ * Nothing to do, we already reset hwc->interrupts.
*/
- if ((counter->attr.exclude_kernel || !data.regs) &&
- !counter->attr.exclude_user)
- data.regs = task_pt_regs(current);
+}
- if (data.regs) {
- if (perf_counter_overflow(counter, 0, &data))
- ret = HRTIMER_NORESTART;
- }
+static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+ int nmi, struct perf_sample_data *data)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
- period = max_t(u64, 10000, counter->hw.sample_period);
- hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+ atomic64_add(nr, &counter->count);
- return ret;
-}
+ if (!hwc->sample_period)
+ return;
-static void perf_swcounter_overflow(struct perf_counter *counter,
- int nmi, struct perf_sample_data *data)
-{
- data->period = counter->hw.last_period;
+ if (!data->regs)
+ return;
- perf_swcounter_update(counter);
- perf_swcounter_set_period(counter);
- if (perf_counter_overflow(counter, nmi, data))
- /* soft-disable the counter */
- ;
+ if (!atomic64_add_negative(nr, &hwc->period_left))
+ perf_swcounter_overflow(counter, nmi, data);
}
static int perf_swcounter_is_counting(struct perf_counter *counter)
{
- struct perf_counter_context *ctx;
- unsigned long flags;
- int count;
-
+ /*
+ * The counter is active, we're good!
+ */
if (counter->state == PERF_COUNTER_STATE_ACTIVE)
return 1;
+ /*
+ * The counter is off/error, not counting.
+ */
if (counter->state != PERF_COUNTER_STATE_INACTIVE)
return 0;
/*
- * If the counter is inactive, it could be just because
- * its task is scheduled out, or because it's in a group
- * which could not go on the PMU. We want to count in
- * the first case but not the second. If the context is
- * currently active then an inactive software counter must
- * be the second case. If it's not currently active then
- * we need to know whether the counter was active when the
- * context was last active, which we can determine by
- * comparing counter->tstamp_stopped with ctx->time.
- *
- * We are within an RCU read-side critical section,
- * which protects the existence of *ctx.
+ * The counter is inactive, if the context is active
+ * we're part of a group that didn't make it on the 'pmu',
+ * not counting.
*/
- ctx = counter->ctx;
- spin_lock_irqsave(&ctx->lock, flags);
- count = 1;
- /* Re-check state now we have the lock */
- if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
- counter->ctx->is_active ||
- counter->tstamp_stopped < ctx->time)
- count = 0;
- spin_unlock_irqrestore(&ctx->lock, flags);
- return count;
+ if (counter->ctx->is_active)
+ return 0;
+
+ /*
+ * We're inactive and the context is too, this means the
+ * task is scheduled out, we're counting events that happen
+ * to us, like migration events.
+ */
+ return 1;
}
static int perf_swcounter_match(struct perf_counter *counter,
@@ -3449,15 +3644,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
return 1;
}
-static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
- int nmi, struct perf_sample_data *data)
-{
- int neg = atomic64_add_negative(nr, &counter->hw.count);
-
- if (counter->hw.sample_period && !neg && data->regs)
- perf_swcounter_overflow(counter, nmi, data);
-}
-
static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
enum perf_type_id type,
u32 event, u64 nr, int nmi,
@@ -3536,27 +3722,66 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
static void perf_swcounter_read(struct perf_counter *counter)
{
- perf_swcounter_update(counter);
}
static int perf_swcounter_enable(struct perf_counter *counter)
{
- perf_swcounter_set_period(counter);
+ struct hw_perf_counter *hwc = &counter->hw;
+
+ if (hwc->sample_period) {
+ hwc->last_period = hwc->sample_period;
+ perf_swcounter_set_period(counter);
+ }
return 0;
}
static void perf_swcounter_disable(struct perf_counter *counter)
{
- perf_swcounter_update(counter);
}
static const struct pmu perf_ops_generic = {
.enable = perf_swcounter_enable,
.disable = perf_swcounter_disable,
.read = perf_swcounter_read,
+ .unthrottle = perf_swcounter_unthrottle,
};
/*
+ * hrtimer based swcounter callback
+ */
+
+static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+{
+ enum hrtimer_restart ret = HRTIMER_RESTART;
+ struct perf_sample_data data;
+ struct perf_counter *counter;
+ u64 period;
+
+ counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
+ counter->pmu->read(counter);
+
+ data.addr = 0;
+ data.regs = get_irq_regs();
+ /*
+ * In case we exclude kernel IPs or are somehow not in interrupt
+ * context, provide the next best thing, the user IP.
+ */
+ if ((counter->attr.exclude_kernel || !data.regs) &&
+ !counter->attr.exclude_user)
+ data.regs = task_pt_regs(current);
+
+ if (data.regs) {
+ if (perf_counter_overflow(counter, 0, &data))
+ ret = HRTIMER_NORESTART;
+ }
+
+ period = max_t(u64, 10000, counter->hw.sample_period);
+ hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+ return ret;
+}
+
+/*
* Software counter: cpu wall time clock
*/
@@ -3673,17 +3898,24 @@ static const struct pmu perf_ops_task_clock = {
};
#ifdef CONFIG_EVENT_PROFILE
-void perf_tpcounter_event(int event_id)
+void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
+ int entry_size)
{
+ struct perf_raw_record raw = {
+ .size = entry_size,
+ .data = record,
+ };
+
struct perf_sample_data data = {
.regs = get_irq_regs(),
- .addr = 0,
+ .addr = addr,
+ .raw = &raw,
};
if (!data.regs)
data.regs = task_pt_regs(current);
- do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
+ do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
}
EXPORT_SYMBOL_GPL(perf_tpcounter_event);
@@ -3697,6 +3929,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)
static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
{
+ /*
+ * Raw tracepoint data is a severe data leak, only allow root to
+ * have these.
+ */
+ if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+ !capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
if (ftrace_profile_enable(counter->attr.config))
return NULL;
@@ -3826,13 +4066,14 @@ perf_counter_alloc(struct perf_counter_attr *attr,
hwc->sample_period = attr->sample_period;
if (attr->freq && attr->sample_freq)
hwc->sample_period = 1;
+ hwc->last_period = hwc->sample_period;
atomic64_set(&hwc->period_left, hwc->sample_period);
/*
- * we currently do not support PERF_SAMPLE_GROUP on inherited counters
+ * we currently do not support PERF_FORMAT_GROUP on inherited counters
*/
- if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
+ if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
goto done;
switch (attr->type) {
@@ -3875,6 +4116,8 @@ done:
atomic_inc(&nr_mmap_counters);
if (counter->attr.comm)
atomic_inc(&nr_comm_counters);
+ if (counter->attr.task)
+ atomic_inc(&nr_task_counters);
}
return counter;
@@ -4236,8 +4479,10 @@ void perf_counter_exit_task(struct task_struct *child)
struct perf_counter_context *child_ctx;
unsigned long flags;
- if (likely(!child->perf_counter_ctxp))
+ if (likely(!child->perf_counter_ctxp)) {
+ perf_counter_task(child, NULL, 0);
return;
+ }
local_irq_save(flags);
/*
@@ -4262,8 +4507,14 @@ void perf_counter_exit_task(struct task_struct *child)
* the counters from it.
*/
unclone_ctx(child_ctx);
- spin_unlock(&child_ctx->lock);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+ /*
+ * Report the task dead after unscheduling the counters so that we
+ * won't get any samples after PERF_EVENT_EXIT. We can however still
+ * get a few PERF_EVENT_READ events.
+ */
+ perf_counter_task(child, child_ctx, 0);
/*
* We can recurse on the same lock type through:
@@ -4484,6 +4735,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
perf_counter_init_cpu(cpu);
break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ hw_perf_counter_setup_online(cpu);
+ break;
+
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
perf_counter_exit_cpu(cpu);
@@ -4508,6 +4764,8 @@ void __init perf_counter_init(void)
{
perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
+ perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
+ (void *)(long)smp_processor_id());
register_cpu_notifier(&perf_cpu_nb);
}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bece7c0b67b..e33a21cb940 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
- struct task_cputime cputime;
+ struct signal_struct *const sig = tsk->signal;
- thread_group_cputimer(tsk, &cputime);
cleanup_timers(tsk->signal->cpu_timers,
- cputime.utime, cputime.stime, cputime.sum_exec_runtime);
+ cputime_add(tsk->utime, sig->utime),
+ cputime_add(tsk->stime, sig->stime),
+ tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
}
static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d195c..d089d052c4a 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)
return -EOPNOTSUPP;
}
+static int no_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *tsave, struct timespec __user *rmtp)
+{
+ return -EOPNOTSUPP;
+}
+
/*
* Return nonzero if we know a priori this clockid_t value is bogus.
*/
@@ -254,6 +260,7 @@ static __init int init_posix_timers(void)
.clock_get = posix_get_monotonic_raw,
.clock_set = do_posix_clock_nosettime,
.timer_create = no_timer_create,
+ .nsleep = no_nsleep,
};
register_posix_clock(CLOCK_REALTIME, &clock_realtime);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 082c320e4db..307c285af59 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -152,7 +152,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
if (!dumpable && !capable(CAP_SYS_PTRACE))
return -EPERM;
- return security_ptrace_may_access(task, mode);
+ return security_ptrace_access_check(task, mode);
}
bool ptrace_may_access(struct task_struct *task, unsigned int mode)
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index fcd107a78c5..29bd4baf9e7 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
/* We got the lock for task. */
debug_rt_mutex_lock(lock);
-
rt_mutex_set_owner(lock, task, 0);
-
+ spin_unlock(&lock->wait_lock);
rt_mutex_deadlock_account_lock(lock, task);
return 1;
}
ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
-
if (ret && !waiter->task) {
/*
* Reset the return value. We might have
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6c251790dd..d014efbf947 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
continue;
- if (lowest_mask)
+ if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+
+ /*
+ * We have to ensure that we have at least one bit
+ * still set in the array, since the map could have
+ * been concurrently emptied between the first and
+ * second reads of vec->mask. If we hit this
+ * condition, simply act as though we never hit this
+ * priority level and continue on.
+ */
+ if (cpumask_any(lowest_mask) >= nr_cpu_ids)
+ continue;
+ }
+
return 1;
}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9ffb2b2ceba..652e8bdef9a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -611,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
+ struct task_struct *tsk = NULL;
+
+ if (entity_is_task(se))
+ tsk = task_of(se);
+
if (se->sleep_start) {
u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
- struct task_struct *tsk = task_of(se);
if ((s64)delta < 0)
delta = 0;
@@ -624,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->sleep_start = 0;
se->sum_sleep_runtime += delta;
- account_scheduler_latency(tsk, delta >> 10, 1);
+ if (tsk)
+ account_scheduler_latency(tsk, delta >> 10, 1);
}
if (se->block_start) {
u64 delta = rq_of(cfs_rq)->clock - se->block_start;
- struct task_struct *tsk = task_of(se);
if ((s64)delta < 0)
delta = 0;
@@ -639,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->block_start = 0;
se->sum_sleep_runtime += delta;
- /*
- * Blocking time is in units of nanosecs, so shift by 20 to
- * get a milliseconds-range estimation of the amount of
- * time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
-
- profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
- delta >> 20);
+ if (tsk) {
+ /*
+ * Blocking time is in units of nanosecs, so shift by
+ * 20 to get a milliseconds-range estimation of the
+ * amount of time that the task spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ profile_hits(SLEEP_PROFILING,
+ (void *)get_wchan(tsk),
+ delta >> 20);
+ }
+ account_scheduler_latency(tsk, delta >> 10, 0);
}
- account_scheduler_latency(tsk, delta >> 10, 0);
}
#endif
}
diff --git a/kernel/signal.c b/kernel/signal.c
index ccf1ceedaeb..64c5deeaca5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2454,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
stack_t oss;
int error;
- if (uoss) {
- oss.ss_sp = (void __user *) current->sas_ss_sp;
- oss.ss_size = current->sas_ss_size;
- oss.ss_flags = sas_ss_flags(sp);
- }
+ oss.ss_sp = (void __user *) current->sas_ss_sp;
+ oss.ss_size = current->sas_ss_size;
+ oss.ss_flags = sas_ss_flags(sp);
if (uss) {
void __user *ss_sp;
@@ -2466,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
int ss_flags;
error = -EFAULT;
- if (!access_ok(VERIFY_READ, uss, sizeof(*uss))
- || __get_user(ss_sp, &uss->ss_sp)
- || __get_user(ss_flags, &uss->ss_flags)
- || __get_user(ss_size, &uss->ss_size))
+ if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
+ goto out;
+ error = __get_user(ss_sp, &uss->ss_sp) |
+ __get_user(ss_flags, &uss->ss_flags) |
+ __get_user(ss_size, &uss->ss_size);
+ if (error)
goto out;
error = -EPERM;
@@ -2501,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
current->sas_ss_size = ss_size;
}
+ error = 0;
if (uoss) {
error = -EFAULT;
- if (copy_to_user(uoss, &oss, sizeof(oss)))
+ if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
goto out;
+ error = __put_user(oss.ss_sp, &uoss->ss_sp) |
+ __put_user(oss.ss_size, &uoss->ss_size) |
+ __put_user(oss.ss_flags, &uoss->ss_flags);
}
- error = 0;
out:
return error;
}
diff --git a/kernel/smp.c b/kernel/smp.c
index ad63d850120..94188b8ecc3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
return NOTIFY_BAD;
break;
-#ifdef CONFIG_CPU_HOTPLUG
+#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 98e02328c67..71d8dc7f992 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1306,10 +1306,10 @@ static struct ctl_table vm_table[] = {
{
.ctl_name = CTL_UNNUMBERED,
.procname = "mmap_min_addr",
- .data = &mmap_min_addr,
- .maxlen = sizeof(unsigned long),
+ .data = &dac_mmap_min_addr,
+ .maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = &proc_doulongvec_minmax,
+ .proc_handler = &mmap_min_addr_handler,
},
#ifdef CONFIG_NUMA
{
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index a6dcd67b041..620b58abdc3 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -137,11 +137,12 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
*/
int clockevents_register_notifier(struct notifier_block *nb)
{
+ unsigned long flags;
int ret;
- spin_lock(&clockevents_lock);
+ spin_lock_irqsave(&clockevents_lock, flags);
ret = raw_notifier_chain_register(&clockevents_chain, nb);
- spin_unlock(&clockevents_lock);
+ spin_unlock_irqrestore(&clockevents_lock, flags);
return ret;
}
@@ -178,16 +179,18 @@ static void clockevents_notify_released(void)
*/
void clockevents_register_device(struct clock_event_device *dev)
{
+ unsigned long flags;
+
BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
BUG_ON(!dev->cpumask);
- spin_lock(&clockevents_lock);
+ spin_lock_irqsave(&clockevents_lock, flags);
list_add(&dev->list, &clockevent_devices);
clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
clockevents_notify_released();
- spin_unlock(&clockevents_lock);
+ spin_unlock_irqrestore(&clockevents_lock, flags);
}
EXPORT_SYMBOL_GPL(clockevents_register_device);
@@ -235,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old,
void clockevents_notify(unsigned long reason, void *arg)
{
struct list_head *node, *tmp;
+ unsigned long flags;
- spin_lock(&clockevents_lock);
+ spin_lock_irqsave(&clockevents_lock, flags);
clockevents_do_notify(reason, arg);
switch (reason) {
@@ -251,7 +255,7 @@ void clockevents_notify(unsigned long reason, void *arg)
default:
break;
}
- spin_unlock(&clockevents_lock);
+ spin_unlock_irqrestore(&clockevents_lock, flags);
}
EXPORT_SYMBOL_GPL(clockevents_notify);
#endif
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 877dbedc311..c2ec25087a3 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -205,11 +205,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
* Powerstate information: The system enters/leaves a state, where
* affected devices might stop
*/
-static void tick_do_broadcast_on_off(void *why)
+static void tick_do_broadcast_on_off(unsigned long *reason)
{
struct clock_event_device *bc, *dev;
struct tick_device *td;
- unsigned long flags, *reason = why;
+ unsigned long flags;
int cpu, bc_stopped;
spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -276,8 +276,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu)
printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
"offline CPU #%d\n", *oncpu);
else
- smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
- &reason, 1);
+ tick_do_broadcast_on_off(&reason);
}
/*
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a999b92a127..fddd69d16e0 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -286,7 +286,7 @@ static int __init init_timer_list_procfs(void)
{
struct proc_dir_entry *pe;
- pe = proc_create("timer_list", 0644, NULL, &timer_list_fops);
+ pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
if (!pe)
return -ENOMEM;
return 0;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 1090b0aed9b..7a34cb563fe 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -267,8 +267,8 @@ static void blk_trace_free(struct blk_trace *bt)
{
debugfs_remove(bt->msg_file);
debugfs_remove(bt->dropped_file);
- debugfs_remove(bt->dir);
relay_close(bt->rchan);
+ debugfs_remove(bt->dir);
free_percpu(bt->sequence);
free_percpu(bt->msg_data);
kfree(bt);
@@ -378,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
- struct dentry *parent = dentry->d_parent;
debugfs_remove(dentry);
- /*
- * this will fail for all but the last file, but that is ok. what we
- * care about is the top level buts->name directory going away, when
- * the last trace file is gone. Then we don't have to rmdir() that
- * manually on trace stop, so it nicely solves the issue with
- * force killing of running traces.
- */
-
- debugfs_remove(parent);
return 0;
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1f3ec2afa51..25edd5cc593 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1662,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
mutex_lock(&ftrace_regex_lock);
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND))
+ (file->f_flags & O_TRUNC))
ftrace_filter_reset(enable);
if (file->f_mode & FMODE_READ) {
@@ -2278,7 +2278,11 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
read++;
cnt--;
- if (!(iter->flags & ~FTRACE_ITER_CONT)) {
+ /*
+ * If the parser haven't finished with the last write,
+ * continue reading the user input without skipping spaces.
+ */
+ if (!(iter->flags & FTRACE_ITER_CONT)) {
/* skip white space */
while (cnt && isspace(ch)) {
ret = get_user(ch, ubuf++);
@@ -2288,8 +2292,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
cnt--;
}
+ /* only spaces were written */
if (isspace(ch)) {
- file->f_pos += read;
+ *ppos += read;
ret = read;
goto out;
}
@@ -2319,12 +2324,12 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
if (ret)
goto out;
iter->buffer_idx = 0;
- } else
+ } else {
iter->flags |= FTRACE_ITER_CONT;
+ iter->buffer[iter->buffer_idx++] = ch;
+ }
-
- file->f_pos += read;
-
+ *ppos += read;
ret = read;
out:
mutex_unlock(&ftrace_regex_lock);
@@ -2577,7 +2582,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
mutex_lock(&graph_lock);
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND)) {
+ (file->f_flags & O_TRUNC)) {
ftrace_graph_count = 0;
memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bf27bb7a63e..a330513d96c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -735,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer)
put_online_cpus();
+ kfree(buffer->buffers);
free_cpumask_var(buffer->cpumask);
kfree(buffer);
@@ -1785,7 +1786,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
*/
RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
- if (!rb_try_to_discard(cpu_buffer, event))
+ if (rb_try_to_discard(cpu_buffer, event))
goto out;
/*
@@ -2383,7 +2384,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
* the box. Return the padding, and we will release
* the current locks, and try again.
*/
- rb_advance_reader(cpu_buffer);
return event;
case RINGBUF_TYPE_TIME_EXTEND:
@@ -2486,7 +2486,7 @@ static inline int rb_ok_to_lock(void)
* buffer too. A one time deal is all you get from reading
* the ring buffer from an NMI.
*/
- if (likely(!in_nmi() && !oops_in_progress))
+ if (likely(!in_nmi()))
return 1;
tracing_off_permanent();
@@ -2519,6 +2519,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
if (dolock)
spin_lock(&cpu_buffer->reader_lock);
event = rb_buffer_peek(buffer, cpu, ts);
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ rb_advance_reader(cpu_buffer);
if (dolock)
spin_unlock(&cpu_buffer->reader_lock);
local_irq_restore(flags);
@@ -2590,12 +2592,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
spin_lock(&cpu_buffer->reader_lock);
event = rb_buffer_peek(buffer, cpu, ts);
- if (!event)
- goto out_unlock;
-
- rb_advance_reader(cpu_buffer);
+ if (event)
+ rb_advance_reader(cpu_buffer);
- out_unlock:
if (dolock)
spin_unlock(&cpu_buffer->reader_lock);
local_irq_restore(flags);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8bc8d8afea6..8c358395d33 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
}
+EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
int type,
@@ -2031,7 +2032,7 @@ static int tracing_open(struct inode *inode, struct file *file)
/* If this file was open for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND)) {
+ (file->f_flags & O_TRUNC)) {
long cpu = (long) inode->i_private;
if (cpu == TRACE_PIPE_ALL_CPU)
@@ -3085,7 +3086,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
break;
}
- trace_consume(iter);
+ if (ret != TRACE_TYPE_NO_CONSUME)
+ trace_consume(iter);
rem -= count;
if (!find_next_entry_inc(iter)) {
rem = 0;
@@ -3894,17 +3896,9 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret < 0)
return ret;
- switch (val) {
- case 0:
- trace_flags &= ~(1 << index);
- break;
- case 1:
- trace_flags |= 1 << index;
- break;
-
- default:
+ if (val != 0 && val != 1)
return -EINVAL;
- }
+ set_tracer_flags(1 << index, val);
*ppos += cnt;
@@ -4233,8 +4227,11 @@ static void __ftrace_dump(bool disable_tracing)
iter.pos = -1;
if (find_next_entry_inc(&iter) != NULL) {
- print_trace_line(&iter);
- trace_consume(&iter);
+ int ret;
+
+ ret = print_trace_line(&iter);
+ if (ret != TRACE_TYPE_NO_CONSUME)
+ trace_consume(&iter);
}
trace_printk_seq(&iter.seq);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3548ae5cc78..8b9f4f6e955 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts);
-void tracing_generic_entry_update(struct trace_entry *entry,
- unsigned long flags,
- int pc);
-
void default_wait_pipe(struct trace_iterator *iter);
void poll_wait_pipe(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 5b5895afecf..11ba5bb4ed0 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id)
mutex_lock(&event_mutex);
list_for_each_entry(event, &ftrace_events, list) {
- if (event->id == event_id) {
+ if (event->id == event_id && event->profile_enable) {
ret = event->profile_enable(event);
break;
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53c8fd376a8..e75276a49cf 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -376,7 +376,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
const struct seq_operations *seq_ops;
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND))
+ (file->f_flags & O_TRUNC))
ftrace_clear_events();
seq_ops = inode->i_private;
@@ -940,7 +940,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
entry = trace_create_file("enable", 0644, call->dir, call,
enable);
- if (call->id)
+ if (call->id && call->profile_enable)
entry = trace_create_file("id", 0444, call->dir, call,
id);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 936c621bbf4..f32dc9d1ea7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -624,9 +624,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
return -ENOSPC;
}
- filter->preds[filter->n_preds] = pred;
- filter->n_preds++;
-
list_for_each_entry(call, &ftrace_events, list) {
if (!call->define_fields)
@@ -643,6 +640,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
}
replace_filter_string(call->filter, filter_string);
}
+
+ filter->preds[filter->n_preds] = pred;
+ filter->n_preds++;
out:
return err;
}
@@ -1029,12 +1029,17 @@ static int replace_preds(struct event_subsystem *system,
if (elt->op == OP_AND || elt->op == OP_OR) {
pred = create_logical_pred(elt->op);
+ if (!pred)
+ return -ENOMEM;
if (call) {
err = filter_add_pred(ps, call, pred);
filter_free_pred(pred);
- } else
+ } else {
err = filter_add_subsystem_pred(ps, system,
pred, filter_string);
+ if (err)
+ filter_free_pred(pred);
+ }
if (err)
return err;
@@ -1048,12 +1053,17 @@ static int replace_preds(struct event_subsystem *system,
}
pred = create_pred(elt->op, operand1, operand2);
+ if (!pred)
+ return -ENOMEM;
if (call) {
err = filter_add_pred(ps, call, pred);
filter_free_pred(pred);
- } else
+ } else {
err = filter_add_subsystem_pred(ps, system, pred,
filter_string);
+ if (err)
+ filter_free_pred(pred);
+ }
if (err)
return err;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d2249abafb5..420ec348757 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -843,9 +843,16 @@ print_graph_function(struct trace_iterator *iter)
switch (entry->type) {
case TRACE_GRAPH_ENT: {
- struct ftrace_graph_ent_entry *field;
+ /*
+ * print_graph_entry() may consume the current event,
+ * thus @field may become invalid, so we need to save it.
+ * sizeof(struct ftrace_graph_ent_entry) is very small,
+ * it can be safely saved at the stack.
+ */
+ struct ftrace_graph_ent_entry *field, saved;
trace_assign_type(field, entry);
- return print_graph_entry(field, s, iter);
+ saved = *field;
+ return print_graph_entry(&saved, s, iter);
}
case TRACE_GRAPH_RET: {
struct ftrace_graph_ret_entry *field;
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 7b627811082..687699d365a 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -176,7 +176,7 @@ static int t_show(struct seq_file *m, void *v)
const char *str = *fmt;
int i;
- seq_printf(m, "0x%lx : \"", (unsigned long)fmt);
+ seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
/*
* Tabs and new lines need to be converted.
diff --git a/kernel/wait.c b/kernel/wait.c
index ea7c3b4275c..c4bd3d825f3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,13 +10,14 @@
#include <linux/wait.h>
#include <linux/hash.h>
-void init_waitqueue_head(wait_queue_head_t *q)
+void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
{
spin_lock_init(&q->lock);
+ lockdep_set_class(&q->lock, key);
INIT_LIST_HEAD(&q->task_list);
}
-EXPORT_SYMBOL(init_waitqueue_head);
+EXPORT_SYMBOL(__init_waitqueue_head);
void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0668795d881..3c44b56b0da 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -600,7 +600,12 @@ static struct workqueue_struct *keventd_wq __read_mostly;
* schedule_work - put work task in global workqueue
* @work: job to be done
*
- * This puts a job in the kernel-global workqueue.
+ * Returns zero if @work was already on the kernel-global workqueue and
+ * non-zero otherwise.
+ *
+ * This puts a job in the kernel-global workqueue if it was not already
+ * queued and leaves it in the same position on the kernel-global
+ * workqueue otherwise.
*/
int schedule_work(struct work_struct *work)
{