diff options
Diffstat (limited to 'kernel')
31 files changed, 2208 insertions, 891 deletions
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index d2e3c786646..e683869365d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) } /* Initialize a parent watch entry. */ -static struct audit_parent *audit_init_parent(struct nameidata *ndp) +static struct audit_parent *audit_init_parent(struct path *path) { - struct inode *inode = ndp->path.dentry->d_inode; + struct inode *inode = path->dentry->d_inode; struct audit_parent *parent; int ret; @@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent) } /* Get path information necessary for adding watches. */ -static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) +static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct nameidata *ndparent, *ndwatch; + struct nameidata nd; + struct dentry *d; int err; - ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); - if (unlikely(!ndparent)) - return -ENOMEM; + err = kern_path_parent(watch->path, &nd); + if (err) + return err; - ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); - if (unlikely(!ndwatch)) { - kfree(ndparent); - return -ENOMEM; + if (nd.last_type != LAST_NORM) { + path_put(&nd.path); + return -EINVAL; } - err = path_lookup(path, LOOKUP_PARENT, ndparent); - if (err) { - kfree(ndparent); - kfree(ndwatch); - return err; + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); + if (IS_ERR(d)) { + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + return PTR_ERR(d); } - - err = path_lookup(path, 0, ndwatch); - if (err) { - kfree(ndwatch); - ndwatch = NULL; + if (d->d_inode) { + /* update watch filter fields */ + watch->dev = d->d_inode->i_sb->s_dev; + watch->ino = d->d_inode->i_ino; } + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - *ndp = ndparent; - *ndw = ndwatch; - + *parent = nd.path; + dput(d); return 0; } -/* Release resources used for watch path information. */ -static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) -{ - if (ndp) { - path_put(&ndp->path); - kfree(ndp); - } - if (ndw) { - path_put(&ndw->path); - kfree(ndw); - } -} - /* Associate the given rule with an existing parent. * Caller must hold audit_filter_mutex. */ static void audit_add_to_parent(struct audit_krule *krule, @@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) { struct audit_watch *watch = krule->watch; struct audit_parent *parent; - struct nameidata *ndp = NULL, *ndw = NULL; + struct path parent_path; int h, ret = 0; mutex_unlock(&audit_filter_mutex); /* Avoid calling path_lookup under audit_filter_mutex. */ - ret = audit_get_nd(watch->path, &ndp, &ndw); - if (ret) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); - goto error; - } + ret = audit_get_nd(watch, &parent_path); + /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - /* update watch filter fields */ - if (ndw) { - watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->path.dentry->d_inode->i_ino; - } + if (ret) + return ret; /* either find an old parent or attach a new one */ - parent = audit_find_parent(ndp->path.dentry->d_inode); + parent = audit_find_parent(parent_path.dentry->d_inode); if (!parent) { - parent = audit_init_parent(ndp); + parent = audit_init_parent(&parent_path); if (IS_ERR(parent)) { ret = PTR_ERR(parent); goto error; @@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) h = audit_hash_ino((u32)watch->ino); *list = &audit_inode_hash[h]; error: - audit_put_nd(ndp, ndw); /* NULL args OK */ + path_put(&parent_path); return ret; - } void audit_remove_watch_rule(struct audit_krule *krule) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b24d7027b83..95362d15128 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child) */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { - int i; struct css_set *cg; - - if (run_callbacks && need_forkexit_callback) { - /* - * modular subsystems can't use callbacks, so no need to lock - * the subsys array - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss->exit) - ss->exit(ss, tsk); - } - } + int i; /* * Unlink from the css_set task list if necessary. @@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) task_lock(tsk); cg = tsk->cgroups; tsk->cgroups = &init_css_set; + + if (run_callbacks && need_forkexit_callback) { + /* + * modular subsystems can't use callbacks, so no need to lock + * the subsys array + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->exit) { + struct cgroup *old_cgrp = + rcu_dereference_raw(cg->subsys[i])->cgroup; + struct cgroup *cgrp = task_cgroup(tsk, i); + ss->exit(ss, cgrp, old_cgrp, tsk); + } + } + } task_unlock(tsk); + if (cg) put_css_set_taskexit(cg); } @@ -4813,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +/* + * get corresponding css from file open on cgroupfs directory + */ +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +{ + struct cgroup *cgrp; + struct inode *inode; + struct cgroup_subsys_state *css; + + inode = f->f_dentry->d_inode; + /* check in cgroup filesystem dir */ + if (inode->i_op != &cgroup_dir_inode_operations) + return ERR_PTR(-EBADF); + + if (id < 0 || id >= CGROUP_SUBSYS_COUNT) + return ERR_PTR(-EINVAL); + + /* get cgroup */ + cgrp = __d_cgrp(f->f_dentry); + css = cgrp->subsys[id]; + return css ? css : ERR_PTR(-ENOENT); +} + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4349935c2ad..e92e9818903 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1575,8 +1575,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, return -ENODEV; trialcs = alloc_trial_cpuset(cs); - if (!trialcs) - return -ENOMEM; + if (!trialcs) { + retval = -ENOMEM; + goto out; + } switch (cft->private) { case FILE_CPULIST: @@ -1591,6 +1593,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, } free_trial_cpuset(trialcs); +out: cgroup_unlock(); return retval; } diff --git a/kernel/cred.c b/kernel/cred.c index 3a9d6dd53a6..2343c132c5a 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar; static struct thread_group_cred init_tgcred = { .usage = ATOMIC_INIT(2), .tgid = 0, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), }; #endif diff --git a/kernel/futex.c b/kernel/futex.c index b766d28accd..bda41571538 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, return NULL; } -static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) +static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, + u32 uval, u32 newval) { - u32 curval; + int ret; pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); - return curval; + return ret; } static int get_futex_value_locked(u32 *dest, u32 __user *from) @@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct task_struct *task, int set_waiters) { int lock_taken, ret, ownerdied = 0; - u32 uval, newval, curval; + u32 uval, newval, curval, vpid = task_pid_vnr(task); retry: ret = lock_taken = 0; @@ -684,19 +685,17 @@ retry: * (by doing a 0 -> TID atomic cmpxchg), while holding all * the locks. It will most likely not succeed. */ - newval = task_pid_vnr(task); + newval = vpid; if (set_waiters) newval |= FUTEX_WAITERS; - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) return -EFAULT; /* * Detect deadlocks. */ - if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) + if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; /* @@ -723,14 +722,12 @@ retry: */ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); + newval = (curval & ~FUTEX_TID_MASK) | vpid; ownerdied = 0; lock_taken = 1; } - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; if (unlikely(curval != uval)) goto retry; @@ -775,6 +772,24 @@ retry: return ret; } +/** + * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be NULL and must be held by the caller. + */ +static void __unqueue_futex(struct futex_q *q) +{ + struct futex_hash_bucket *hb; + + if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr) + || plist_node_empty(&q->list))) + return; + + hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); + plist_del(&q->list, &hb->chain); +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. @@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q) */ get_task_struct(p); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); /* * The waiting task can free the futex_q as soon as * q->lock_ptr = NULL is written, without taking any locks. A @@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) ret = -EFAULT; else if (curval != uval) ret = -EINVAL; @@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) * There is no waiter, so we unlock the futex. The owner died * bit has not to be preserved here. We are the owner: */ - oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); - - if (oldval == -EFAULT) - return oldval; + if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) + return -EFAULT; if (oldval != uval) return -EAGAIN; @@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_del(&q->list, &hb1->chain); plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb2->lock; -#endif } get_futex_key_refs(key2); q->key = *key2; @@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, get_futex_key_refs(key); q->key = *key; - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; q->lock_ptr = &hb->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif wake_up_state(q->task, TASK_NORMAL); } @@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif plist_add(&q->list, &hb->chain); q->task = current; spin_unlock(&hb->lock); @@ -1504,8 +1505,7 @@ retry: spin_unlock(lock_ptr); goto retry; } - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(q->pi_state); @@ -1525,8 +1525,7 @@ retry: static void unqueue_me_pi(struct futex_q *q) __releases(q->lock_ptr) { - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(!q->pi_state); free_pi_state(q->pi_state); @@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* * We are here either because we stole the rtmutex from the - * pending owner or we are the pending owner which failed to - * get the rtmutex. We have to replace the pending owner TID - * in the user space variable. This must be atomic as we have - * to preserve the owner died bit here. + * previous highest priority waiter or we are the highest priority + * waiter but failed to get the rtmutex the first time. + * We have to replace the newowner TID in the user space variable. + * This must be atomic as we have to preserve the owner died bit here. * * Note: We write the user space value _before_ changing the pi_state * because we can fault here. Imagine swapped out pages or a fork @@ -1578,9 +1577,7 @@ retry: while (1) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) goto handle_fault; if (curval == uval) break; @@ -1608,8 +1605,8 @@ retry: /* * To handle the page fault we need to drop the hash bucket - * lock here. That gives the other task (either the pending - * owner itself or the task which stole the rtmutex) the + * lock here. That gives the other task (either the highest priority + * waiter itself or the task which stole the rtmutex) the * chance to try the fixup of the pi_state. So once we are * back from handling the fault we need to check the pi_state * after reacquiring the hash bucket lock and before trying to @@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) /* * pi_state is incorrect, some other task did a lock steal and * we returned due to timeout or signal without taking the - * rt_mutex. Too late. We can access the rt_mutex_owner without - * locking, as the other task is now blocked on the hash bucket - * lock. Fix the state up. + * rt_mutex. Too late. */ + raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); + if (!owner) + owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); + raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } /* * Paranoia check. If we did not take the lock, then we should not be - * the owner, nor the pending owner, of the rt_mutex. + * the owner of the rt_mutex. */ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " @@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for - * any cond. If we queued after testing *uaddr, that would open - * a race condition where we could block indefinitely with + * any cond. If we locked the hash-bucket after testing *uaddr, that + * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * - * A consequence is that futex_wait() can return zero and absorb - * a wakeup when *uaddr != val on entry to the syscall. This is - * rare, but normal. + * On the other hand, we insert q and release the hash-bucket only + * after testing *uaddr. This guarantees that futex_wait() will NOT + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. */ retry: ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); @@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - u32 uval; struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; + u32 uval, vpid = task_pid_vnr(current); int ret; retry: @@ -2057,7 +2057,7 @@ retry: /* * We release only a lock we actually own: */ - if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) + if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); @@ -2072,17 +2072,14 @@ retry: * again. If it succeeds then we can return without waking * anyone else up: */ - if (!(uval & FUTEX_OWNER_DIED)) - uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); - - - if (unlikely(uval == -EFAULT)) + if (!(uval & FUTEX_OWNER_DIED) && + cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) goto pi_faulted; /* * Rare case: we managed to release the lock atomically, * no need to wake anyone else up: */ - if (unlikely(uval == task_pid_vnr(current))) + if (unlikely(uval == vpid)) goto out_unlock; /* @@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * We were woken prior to requeue by a timeout or a signal. * Unqueue the futex_q and determine which it was. */ - plist_del(&q->list, &q->list.plist); + plist_del(&q->list, &hb->chain); /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; @@ -2463,11 +2460,20 @@ retry: * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); - - if (nval == -EFAULT) - return -1; - + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. + */ + if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { + if (fault_in_user_writeable(uaddr)) + return -1; + goto retry; + } if (nval != uval) goto retry; @@ -2678,8 +2684,7 @@ static int __init futex_init(void) * implementation, the non-functional ones will return * -ENOSYS. */ - curval = cmpxchg_futex_value_locked(NULL, 0, 0); - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) futex_cmpxchg_enabled = 1; for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0c8d7c04861..e38f5a073d0 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -334,6 +334,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); static struct debug_obj_descr hrtimer_debug_descr; +static void *hrtimer_debug_hint(void *addr) +{ + return ((struct hrtimer *) addr)->function; +} + /* * fixup_init is called when: * - an active object is initialized @@ -393,6 +398,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9033c1c7082..2782bacdf49 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -282,8 +282,17 @@ EXPORT_SYMBOL(disable_irq); void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) { - if (resume) + if (resume) { + if (!(desc->status & IRQ_SUSPENDED)) { + if (!desc->action) + return; + if (!(desc->action->flags & IRQF_FORCE_RESUME)) + return; + /* Pretend that it got disabled ! */ + desc->depth++; + } desc->status &= ~IRQ_SUSPENDED; + } switch (desc->depth) { case 0: diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 0d4005d85b0..d6bfb89cce9 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -53,9 +53,6 @@ void resume_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; - if (!(desc->status & IRQ_SUSPENDED)) - continue; - raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); raw_spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 656222fcf76..ed253aa24ba 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -38,13 +38,96 @@ #include <asm/irq_regs.h> +struct remote_function_call { + struct task_struct *p; + int (*func)(void *info); + void *info; + int ret; +}; + +static void remote_function(void *data) +{ + struct remote_function_call *tfc = data; + struct task_struct *p = tfc->p; + + if (p) { + tfc->ret = -EAGAIN; + if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + return; + } + + tfc->ret = tfc->func(tfc->info); +} + +/** + * task_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + * + * returns: @func return value, or + * -ESRCH - when the process isn't running + * -EAGAIN - when the process moved away + */ +static int +task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = p, + .func = func, + .info = info, + .ret = -ESRCH, /* No such (running) process */ + }; + + if (task_curr(p)) + smp_call_function_single(task_cpu(p), remote_function, &data, 1); + + return data.ret; +} + +/** + * cpu_function_call - call a function on the cpu + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func on the remote cpu. + * + * returns: @func return value or -ENXIO when the cpu is offline + */ +static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = NULL, + .func = func, + .info = info, + .ret = -ENXIO, /* No such CPU */ + }; + + smp_call_function_single(cpu, remote_function, &data, 1); + + return data.ret; +} + +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ + PERF_FLAG_FD_OUTPUT |\ + PERF_FLAG_PID_CGROUP) + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; -atomic_t perf_task_events __read_mostly; +/* + * perf_sched_events : >0 events exist + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu + */ +atomic_t perf_sched_events __read_mostly; +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); + static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; @@ -67,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* * max perf event sample rate */ -int sysctl_perf_event_sample_rate __read_mostly = 100000; +#define DEFAULT_MAX_SAMPLE_RATE 100000 +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int max_samples_per_tick __read_mostly = + DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); + +int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + + return 0; +} static atomic64_t perf_event_id; @@ -75,7 +175,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); + enum event_type_t event_type, + struct task_struct *task); + +static void update_context_time(struct perf_event_context *ctx); +static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } @@ -89,6 +193,360 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +#ifdef CONFIG_CGROUP_PERF + +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. + */ +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_subsys_state(task, perf_subsys_id), + struct perf_cgroup, css); +} + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + return !event->cgrp || event-> |