Diffstat (limited to 'kernel/events/core.c')
-rw-r--r--	kernel/events/core.c	713
1 file changed, 454 insertions, 259 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index dd236b66ca3..6b17ac1b0c2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -39,6 +39,8 @@  #include <linux/hw_breakpoint.h>  #include <linux/mm_types.h>  #include <linux/cgroup.h> +#include <linux/module.h> +#include <linux/mman.h>  #include "internal.h" @@ -119,7 +121,8 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)  #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\  		       PERF_FLAG_FD_OUTPUT  |\ -		       PERF_FLAG_PID_CGROUP) +		       PERF_FLAG_PID_CGROUP |\ +		       PERF_FLAG_FD_CLOEXEC)  /*   * branch priv levels that need permission checks @@ -175,8 +178,8 @@ int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE;  static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);  static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS; -static atomic_t perf_sample_allowed_ns __read_mostly = -	ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); +static int perf_sample_allowed_ns __read_mostly = +	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;  void update_perf_cpu_limits(void)  { @@ -184,7 +187,7 @@ void update_perf_cpu_limits(void)  	tmp *= sysctl_perf_cpu_time_max_percent;  	do_div(tmp, 100); -	atomic_set(&perf_sample_allowed_ns, tmp); +	ACCESS_ONCE(perf_sample_allowed_ns) = tmp;  }  static int perf_rotate_context(struct perf_cpu_context *cpuctx); @@ -193,7 +196,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,  		void __user *buffer, size_t *lenp,  		loff_t *ppos)  { -	int ret = proc_dointvec(table, write, buffer, lenp, ppos); +	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);  	if (ret || !write)  		return ret; @@ -228,14 +231,33 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,   * we detect that events are taking too long.   
*/  #define NR_ACCUMULATED_SAMPLES 128 -DEFINE_PER_CPU(u64, running_sample_length); +static DEFINE_PER_CPU(u64, running_sample_length); + +static void perf_duration_warn(struct irq_work *w) +{ +	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); +	u64 avg_local_sample_len; +	u64 local_samples_len; + +	local_samples_len = __get_cpu_var(running_sample_length); +	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; + +	printk_ratelimited(KERN_WARNING +			"perf interrupt took too long (%lld > %lld), lowering " +			"kernel.perf_event_max_sample_rate to %d\n", +			avg_local_sample_len, allowed_ns >> 1, +			sysctl_perf_event_sample_rate); +} + +static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);  void perf_sample_event_took(u64 sample_len_ns)  { +	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);  	u64 avg_local_sample_len;  	u64 local_samples_len; -	if (atomic_read(&perf_sample_allowed_ns) == 0) +	if (allowed_ns == 0)  		return;  	/* decay the counter by 1 average sample */ @@ -251,7 +273,7 @@ void perf_sample_event_took(u64 sample_len_ns)  	 */  	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; -	if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) +	if (avg_local_sample_len <= allowed_ns)  		return;  	if (max_samples_per_tick <= 1) @@ -261,14 +283,14 @@ void perf_sample_event_took(u64 sample_len_ns)  	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;  	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; -	printk_ratelimited(KERN_WARNING -			"perf samples too long (%lld > %d), lowering " -			"kernel.perf_event_max_sample_rate to %d\n", -			avg_local_sample_len, -			atomic_read(&perf_sample_allowed_ns), -			sysctl_perf_event_sample_rate); -  	update_perf_cpu_limits(); + +	if (!irq_work_queue(&perf_duration_work)) { +		early_printk("perf interrupt took too long (%lld > %lld), lowering " +			     "kernel.perf_event_max_sample_rate to %d\n", +			     avg_local_sample_len, allowed_ns >> 1, +			     sysctl_perf_event_sample_rate); +	}  }  static atomic64_t perf_event_id; @@ -341,7 +363,7 @@ struct perf_cgroup {  static inline struct perf_cgroup *  perf_cgroup_from_task(struct task_struct *task)  { -	return container_of(task_css(task, perf_subsys_id), +	return container_of(task_css(task, perf_event_cgrp_id),  			    struct perf_cgroup, css);  } @@ -369,11 +391,6 @@ perf_cgroup_match(struct perf_event *event)  				    event->cgrp->css.cgroup);  } -static inline bool perf_tryget_cgroup(struct perf_event *event) -{ -	return css_tryget(&event->cgrp->css); -} -  static inline void perf_put_cgroup(struct perf_event *event)  {  	css_put(&event->cgrp->css); @@ -592,9 +609,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,  	if (!f.file)  		return -EBADF; -	rcu_read_lock(); - -	css = css_from_dir(f.file->f_dentry, &perf_subsys); +	css = css_tryget_online_from_dir(f.file->f_dentry, +					 &perf_event_cgrp_subsys);  	if (IS_ERR(css)) {  		ret = PTR_ERR(css);  		goto out; @@ -603,13 +619,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,  	cgrp = container_of(css, struct perf_cgroup, css);  	event->cgrp = cgrp; -	/* must be done before we fput() the file */ -	if (!perf_tryget_cgroup(event)) { -		event->cgrp = NULL; -		ret = -ENOENT; -		goto out; -	} -  	/*  	 * all events in a group must monitor  	 * the same cgroup because a task belongs @@ -620,7 +629,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,  		ret = -EINVAL;  	}  out: -	rcu_read_unlock(); 
 	fdput(f);  	return ret;  } @@ -899,6 +907,7 @@ static void unclone_ctx(struct perf_event_context *ctx)  		put_ctx(ctx->parent_ctx);  		ctx->parent_ctx = NULL;  	} +	ctx->generation++;  }  static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) @@ -1136,6 +1145,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)  	ctx->nr_events++;  	if (event->attr.inherit_stat)  		ctx->nr_stat++; + +	ctx->generation++;  }  /* @@ -1201,6 +1212,9 @@ static void perf_event__header_size(struct perf_event *event)  	if (sample_type & PERF_SAMPLE_DATA_SRC)  		size += sizeof(data->data_src.val); +	if (sample_type & PERF_SAMPLE_TRANSACTION) +		size += sizeof(data->txn); +  	event->header_size = size;  } @@ -1310,6 +1324,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)  	 */  	if (event->state > PERF_EVENT_STATE_OFF)  		event->state = PERF_EVENT_STATE_OFF; + +	ctx->generation++;  }  static void perf_group_detach(struct perf_event *event) @@ -1388,6 +1404,8 @@ event_sched_out(struct perf_event *event,  	if (event->state != PERF_EVENT_STATE_ACTIVE)  		return; +	perf_pmu_disable(event->pmu); +  	event->state = PERF_EVENT_STATE_INACTIVE;  	if (event->pending_disable) {  		event->pending_disable = 0; @@ -1404,6 +1422,8 @@ event_sched_out(struct perf_event *event,  		ctx->nr_freq--;  	if (event->attr.exclusive || !cpuctx->active_oncpu)  		cpuctx->exclusive = 0; + +	perf_pmu_enable(event->pmu);  }  static void @@ -1426,6 +1446,11 @@ group_sched_out(struct perf_event *group_event,  		cpuctx->exclusive = 0;  } +struct remove_event { +	struct perf_event *event; +	bool detach_group; +}; +  /*   * Cross CPU call to remove a performance event   * @@ -1434,12 +1459,15 @@ group_sched_out(struct perf_event *group_event,   */  static int __perf_remove_from_context(void *info)  { -	struct perf_event *event = info; +	struct remove_event *re = info; +	struct perf_event *event = re->event;  	struct perf_event_context *ctx = event->ctx;  	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);  	raw_spin_lock(&ctx->lock);  	event_sched_out(event, cpuctx, ctx); +	if (re->detach_group) +		perf_group_detach(event);  	list_del_event(event, ctx);  	if (!ctx->nr_events && cpuctx->task_ctx == ctx) {  		ctx->is_active = 0; @@ -1464,10 +1492,14 @@ static int __perf_remove_from_context(void *info)   * When called from perf_event_exit_task, it's OK because the   * context has been detached from its task.   */ -static void perf_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event, bool detach_group)  {  	struct perf_event_context *ctx = event->ctx;  	struct task_struct *task = ctx->task; +	struct remove_event re = { +		.event = event, +		.detach_group = detach_group, +	};  	lockdep_assert_held(&ctx->mutex); @@ -1476,12 +1508,12 @@ static void perf_remove_from_context(struct perf_event *event)  		 * Per cpu events are removed via an smp call and  		 * the removal is always successful.  		 */ -		cpu_function_call(event->cpu, __perf_remove_from_context, event); +		cpu_function_call(event->cpu, __perf_remove_from_context, &re);  		return;  	}  retry: -	if (!task_function_call(task, __perf_remove_from_context, event)) +	if (!task_function_call(task, __perf_remove_from_context, &re))  		return;  	raw_spin_lock_irq(&ctx->lock); @@ -1498,6 +1530,8 @@ retry:  	 * Since the task isn't running, its safe to remove the event, us  	 * holding the ctx->lock ensures the task won't get scheduled in.  	 
*/ +	if (detach_group) +		perf_group_detach(event);  	list_del_event(event, ctx);  	raw_spin_unlock_irq(&ctx->lock);  } @@ -1644,6 +1678,9 @@ event_sched_in(struct perf_event *event,  		 struct perf_event_context *ctx)  {  	u64 tstamp = perf_event_time(event); +	int ret = 0; + +	lockdep_assert_held(&ctx->lock);  	if (event->state <= PERF_EVENT_STATE_OFF)  		return 0; @@ -1666,10 +1703,13 @@ event_sched_in(struct perf_event *event,  	 */  	smp_wmb(); +	perf_pmu_disable(event->pmu); +  	if (event->pmu->add(event, PERF_EF_START)) {  		event->state = PERF_EVENT_STATE_INACTIVE;  		event->oncpu = -1; -		return -EAGAIN; +		ret = -EAGAIN; +		goto out;  	}  	event->tstamp_running += tstamp - event->tstamp_stopped; @@ -1685,7 +1725,10 @@ event_sched_in(struct perf_event *event,  	if (event->attr.exclusive)  		cpuctx->exclusive = 1; -	return 0; +out: +	perf_pmu_enable(event->pmu); + +	return ret;  }  static int @@ -1694,7 +1737,7 @@ group_sched_in(struct perf_event *group_event,  	       struct perf_event_context *ctx)  {  	struct perf_event *event, *partial_group = NULL; -	struct pmu *pmu = group_event->pmu; +	struct pmu *pmu = ctx->pmu;  	u64 now = ctx->time;  	bool simulate = false; @@ -2146,22 +2189,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,  }  /* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled events. - * If the number of enabled events is the same, then the set - * of enabled events should be the same, because these are both - * inherited contexts, therefore we can't access individual events - * in them directly with an fd; we can only enable/disable all - * events via prctl, or enable/disable all events in a family - * via ioctl, which will have the same effect on both contexts. + * Test whether two contexts are equivalent, i.e. whether they have both been + * cloned from the same version of the same context. + * + * Equivalence is measured using a generation number in the context that is + * incremented on each modification to it; see unclone_ctx(), list_add_event() + * and list_del_event().   */  static int context_equiv(struct perf_event_context *ctx1,  			 struct perf_event_context *ctx2)  { -	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx -		&& ctx1->parent_gen == ctx2->parent_gen -		&& !ctx1->pin_count && !ctx2->pin_count; +	/* Pinning disables the swap optimization */ +	if (ctx1->pin_count || ctx2->pin_count) +		return 0; + +	/* If ctx1 is the parent of ctx2 */ +	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) +		return 1; + +	/* If ctx2 is the parent of ctx1 */ +	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) +		return 1; + +	/* +	 * If ctx1 and ctx2 have the same parent; we flatten the parent +	 * hierarchy, see perf_event_init_context(). 
+	 */ +	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && +			ctx1->parent_gen == ctx2->parent_gen) +		return 1; + +	/* Unmatched */ +	return 0;  }  static void __perf_event_sync_stat(struct perf_event *event, @@ -2210,9 +2269,6 @@ static void __perf_event_sync_stat(struct perf_event *event,  	perf_event_update_userpage(next_event);  } -#define list_next_entry(pos, member) \ -	list_entry(pos->member.next, typeof(*pos), member) -  static void perf_event_sync_stat(struct perf_event_context *ctx,  				   struct perf_event_context *next_ctx)  { @@ -2244,7 +2300,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,  {  	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];  	struct perf_event_context *next_ctx; -	struct perf_event_context *parent; +	struct perf_event_context *parent, *next_parent;  	struct perf_cpu_context *cpuctx;  	int do_switch = 1; @@ -2256,10 +2312,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,  		return;  	rcu_read_lock(); -	parent = rcu_dereference(ctx->parent_ctx);  	next_ctx = next->perf_event_ctxp[ctxn]; -	if (parent && next_ctx && -	    rcu_dereference(next_ctx->parent_ctx) == parent) { +	if (!next_ctx) +		goto unlock; + +	parent = rcu_dereference(ctx->parent_ctx); +	next_parent = rcu_dereference(next_ctx->parent_ctx); + +	/* If neither context have a parent context; they cannot be clones. */ +	if (!parent || !next_parent) +		goto unlock; + +	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {  		/*  		 * Looks like the two contexts are clones, so we might be  		 * able to optimize the context switch.  We lock both @@ -2287,6 +2351,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,  		raw_spin_unlock(&next_ctx->lock);  		raw_spin_unlock(&ctx->lock);  	} +unlock:  	rcu_read_unlock();  	if (do_switch) { @@ -2521,8 +2586,6 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,  		if (cpuctx->ctx.nr_branch_stack > 0  		    && pmu->flush_branch_stack) { -			pmu = cpuctx->ctx.pmu; -  			perf_ctx_lock(cpuctx, cpuctx->task_ctx);  			perf_pmu_disable(pmu); @@ -2713,6 +2776,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,  		if (!event_filter_match(event))  			continue; +		perf_pmu_disable(event->pmu); +  		hwc = &event->hw;  		if (hwc->interrupts == MAX_INTERRUPTS) { @@ -2722,7 +2787,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,  		}  		if (!event->attr.freq || !event->attr.sample_freq) -			continue; +			goto next;  		/*  		 * stop the event and update event->count @@ -2744,6 +2809,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,  			perf_adjust_period(event, period, delta, false);  		event->pmu->start(event, delta > 0 ? 
PERF_EF_RELOAD : 0); +	next: +		perf_pmu_enable(event->pmu);  	}  	perf_pmu_enable(ctx->pmu); @@ -2908,6 +2975,22 @@ out:  	local_irq_restore(flags);  } +void perf_event_exec(void) +{ +	struct perf_event_context *ctx; +	int ctxn; + +	rcu_read_lock(); +	for_each_task_context_nr(ctxn) { +		ctx = current->perf_event_ctxp[ctxn]; +		if (!ctx) +			continue; + +		perf_event_enable_on_exec(ctx); +	} +	rcu_read_unlock(); +} +  /*   * Cross CPU call to read the hardware event   */ @@ -3130,7 +3213,8 @@ static void free_event_rcu(struct rcu_head *head)  }  static void ring_buffer_put(struct ring_buffer *rb); -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); +static void ring_buffer_attach(struct perf_event *event, +			       struct ring_buffer *rb);  static void unaccount_event_cpu(struct perf_event *event, int cpu)  { @@ -3181,17 +3265,19 @@ static void __free_event(struct perf_event *event)  	if (event->ctx)  		put_ctx(event->ctx); +	if (event->pmu) +		module_put(event->pmu->module); +  	call_rcu(&event->rcu_head, free_event_rcu);  } -static void free_event(struct perf_event *event) + +static void _free_event(struct perf_event *event)  {  	irq_work_sync(&event->pending);  	unaccount_event(event);  	if (event->rb) { -		struct ring_buffer *rb; -  		/*  		 * Can happen when we close an event with re-directed output.  		 * @@ -3199,57 +3285,38 @@ static void free_event(struct perf_event *event)  		 * over us; possibly making our ring_buffer_put() the last.  		 */  		mutex_lock(&event->mmap_mutex); -		rb = event->rb; -		if (rb) { -			rcu_assign_pointer(event->rb, NULL); -			ring_buffer_detach(event, rb); -			ring_buffer_put(rb); /* could be last */ -		} +		ring_buffer_attach(event, NULL);  		mutex_unlock(&event->mmap_mutex);  	}  	if (is_cgroup_event(event))  		perf_detach_cgroup(event); -  	__free_event(event);  } -int perf_event_release_kernel(struct perf_event *event) +/* + * Used to free events which have a known refcount of 1, such as in error paths + * where the event isn't exposed yet and inherited events. + */ +static void free_event(struct perf_event *event)  { -	struct perf_event_context *ctx = event->ctx; - -	WARN_ON_ONCE(ctx->parent_ctx); -	/* -	 * There are two ways this annotation is useful: -	 * -	 *  1) there is a lock recursion from perf_event_exit_task -	 *     see the comment there. -	 * -	 *  2) there is a lock-inversion with mmap_sem through -	 *     perf_event_read_group(), which takes faults while -	 *     holding ctx->mutex, however this is called after -	 *     the last filedesc died, so there is no possibility -	 *     to trigger the AB-BA case. -	 */ -	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); -	raw_spin_lock_irq(&ctx->lock); -	perf_group_detach(event); -	raw_spin_unlock_irq(&ctx->lock); -	perf_remove_from_context(event); -	mutex_unlock(&ctx->mutex); - -	free_event(event); +	if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, +				"unexpected event refcount: %ld; ptr=%p\n", +				atomic_long_read(&event->refcount), event)) { +		/* leak to avoid use-after-free */ +		return; +	} -	return 0; +	_free_event(event);  } -EXPORT_SYMBOL_GPL(perf_event_release_kernel);  /*   * Called when the last reference to the file is gone.   
*/  static void put_event(struct perf_event *event)  { +	struct perf_event_context *ctx = event->ctx;  	struct task_struct *owner;  	if (!atomic_long_dec_and_test(&event->refcount)) @@ -3288,9 +3355,33 @@ static void put_event(struct perf_event *event)  		put_task_struct(owner);  	} -	perf_event_release_kernel(event); +	WARN_ON_ONCE(ctx->parent_ctx); +	/* +	 * There are two ways this annotation is useful: +	 * +	 *  1) there is a lock recursion from perf_event_exit_task +	 *     see the comment there. +	 * +	 *  2) there is a lock-inversion with mmap_sem through +	 *     perf_event_read_group(), which takes faults while +	 *     holding ctx->mutex, however this is called after +	 *     the last filedesc died, so there is no possibility +	 *     to trigger the AB-BA case. +	 */ +	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); +	perf_remove_from_context(event, true); +	mutex_unlock(&ctx->mutex); + +	_free_event(event);  } +int perf_event_release_kernel(struct perf_event *event) +{ +	put_event(event); +	return 0; +} +EXPORT_SYMBOL_GPL(perf_event_release_kernel); +  static int perf_release(struct inode *inode, struct file *file)  {  	put_event(file->private_data); @@ -3497,7 +3588,7 @@ static void perf_event_for_each(struct perf_event *event,  static int perf_event_period(struct perf_event *event, u64 __user *arg)  {  	struct perf_event_context *ctx = event->ctx; -	int ret = 0; +	int ret = 0, active;  	u64 value;  	if (!is_sampling_event(event)) @@ -3521,6 +3612,20 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)  		event->attr.sample_period = value;  		event->hw.sample_period = value;  	} + +	active = (event->state == PERF_EVENT_STATE_ACTIVE); +	if (active) { +		perf_pmu_disable(ctx->pmu); +		event->pmu->stop(event, PERF_EF_UPDATE); +	} + +	local64_set(&event->hw.period_left, 0); + +	if (active) { +		event->pmu->start(event, PERF_EF_RELOAD); +		perf_pmu_enable(ctx->pmu); +	} +  unlock:  	raw_spin_unlock_irq(&ctx->lock); @@ -3660,6 +3765,26 @@ static void calc_timer_values(struct perf_event *event,  	*running = ctx_time - event->tstamp_running;  } +static void perf_event_init_userpage(struct perf_event *event) +{ +	struct perf_event_mmap_page *userpg; +	struct ring_buffer *rb; + +	rcu_read_lock(); +	rb = rcu_dereference(event->rb); +	if (!rb) +		goto unlock; + +	userpg = rb->user_page; + +	/* Allow new userspace to detect that bit 0 is deprecated */ +	userpg->cap_bit0_is_deprecated = 1; +	userpg->size = offsetof(struct perf_event_mmap_page, __reserved); + +unlock: +	rcu_read_unlock(); +} +  void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)  {  } @@ -3757,28 +3882,47 @@ unlock:  static void ring_buffer_attach(struct perf_event *event,  			       struct ring_buffer *rb)  { +	struct ring_buffer *old_rb = NULL;  	unsigned long flags; -	if (!list_empty(&event->rb_entry)) -		return; +	if (event->rb) { +		/* +		 * Should be impossible, we set this when removing +		 * event->rb_entry and wait/clear when adding event->rb_entry. 
+		 */ +		WARN_ON_ONCE(event->rcu_pending); -	spin_lock_irqsave(&rb->event_lock, flags); -	if (list_empty(&event->rb_entry)) -		list_add(&event->rb_entry, &rb->event_list); -	spin_unlock_irqrestore(&rb->event_lock, flags); -} +		old_rb = event->rb; +		event->rcu_batches = get_state_synchronize_rcu(); +		event->rcu_pending = 1; -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) -{ -	unsigned long flags; +		spin_lock_irqsave(&old_rb->event_lock, flags); +		list_del_rcu(&event->rb_entry); +		spin_unlock_irqrestore(&old_rb->event_lock, flags); +	} -	if (list_empty(&event->rb_entry)) -		return; +	if (event->rcu_pending && rb) { +		cond_synchronize_rcu(event->rcu_batches); +		event->rcu_pending = 0; +	} -	spin_lock_irqsave(&rb->event_lock, flags); -	list_del_init(&event->rb_entry); -	wake_up_all(&event->waitq); -	spin_unlock_irqrestore(&rb->event_lock, flags); +	if (rb) { +		spin_lock_irqsave(&rb->event_lock, flags); +		list_add_rcu(&event->rb_entry, &rb->event_list); +		spin_unlock_irqrestore(&rb->event_lock, flags); +	} + +	rcu_assign_pointer(event->rb, rb); + +	if (old_rb) { +		ring_buffer_put(old_rb); +		/* +		 * Since we detached before setting the new rb, so that we +		 * could attach the new rb, we could have missed a wakeup. +		 * Provide it now. +		 */ +		wake_up_all(&event->waitq); +	}  }  static void ring_buffer_wakeup(struct perf_event *event) @@ -3847,7 +3991,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)  {  	struct perf_event *event = vma->vm_file->private_data; -	struct ring_buffer *rb = event->rb; +	struct ring_buffer *rb = ring_buffer_get(event);  	struct user_struct *mmap_user = rb->mmap_user;  	int mmap_locked = rb->mmap_locked;  	unsigned long size = perf_data_size(rb); @@ -3855,18 +3999,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)  	atomic_dec(&rb->mmap_count);  	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) -		return; +		goto out_put; -	/* Detach current event from the buffer. */ -	rcu_assign_pointer(event->rb, NULL); -	ring_buffer_detach(event, rb); +	ring_buffer_attach(event, NULL);  	mutex_unlock(&event->mmap_mutex);  	/* If there's still other mmap()s of this buffer, we're done. */ -	if (atomic_read(&rb->mmap_count)) { -		ring_buffer_put(rb); /* can't be last */ -		return; -	} +	if (atomic_read(&rb->mmap_count)) +		goto out_put;  	/*  	 * No other mmap()s, detach from all other events that might redirect @@ -3896,11 +4036,9 @@ again:  		 * still restart the iteration to make sure we're not now  		 * iterating the wrong list.  		 
*/ -		if (event->rb == rb) { -			rcu_assign_pointer(event->rb, NULL); -			ring_buffer_detach(event, rb); -			ring_buffer_put(rb); /* can't be last, we still have one */ -		} +		if (event->rb == rb) +			ring_buffer_attach(event, NULL); +  		mutex_unlock(&event->mmap_mutex);  		put_event(event); @@ -3925,6 +4063,7 @@ again:  	vma->vm_mm->pinned_vm -= mmap_locked;  	free_uid(mmap_user); +out_put:  	ring_buffer_put(rb); /* could be last */  } @@ -4042,8 +4181,8 @@ again:  	vma->vm_mm->pinned_vm += extra;  	ring_buffer_attach(event, rb); -	rcu_assign_pointer(event->rb, rb); +	perf_event_init_userpage(event);  	perf_event_update_userpage(event);  unlock: @@ -4551,6 +4690,9 @@ void perf_output_sample(struct perf_output_handle *handle,  	if (sample_type & PERF_SAMPLE_DATA_SRC)  		perf_output_put(handle, data->data_src.val); +	if (sample_type & PERF_SAMPLE_TRANSACTION) +		perf_output_put(handle, data->txn); +  	if (!event->attr.watermark) {  		int wakeup_events = event->attr.wakeup_events; @@ -4950,21 +5092,9 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)  		       NULL);  } -void perf_event_comm(struct task_struct *task) +void perf_event_comm(struct task_struct *task, bool exec)  {  	struct perf_comm_event comm_event; -	struct perf_event_context *ctx; -	int ctxn; - -	rcu_read_lock(); -	for_each_task_context_nr(ctxn) { -		ctx = task->perf_event_ctxp[ctxn]; -		if (!ctx) -			continue; - -		perf_event_enable_on_exec(ctx); -	} -	rcu_read_unlock();  	if (!atomic_read(&nr_comm_events))  		return; @@ -4976,7 +5106,7 @@ void perf_event_comm(struct task_struct *task)  		.event_id  = {  			.header = {  				.type = PERF_RECORD_COMM, -				.misc = 0, +				.misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,  				/* .size */  			},  			/* .pid */ @@ -4999,6 +5129,7 @@ struct perf_mmap_event {  	int			maj, min;  	u64			ino;  	u64			ino_generation; +	u32			prot, flags;  	struct {  		struct perf_event_header	header; @@ -5040,6 +5171,8 @@ static void perf_event_mmap_output(struct perf_event *event,  		mmap_event->event_id.header.size += sizeof(mmap_event->min);  		mmap_event->event_id.header.size += sizeof(mmap_event->ino);  		mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); +		mmap_event->event_id.header.size += sizeof(mmap_event->prot); +		mmap_event->event_id.header.size += sizeof(mmap_event->flags);  	}  	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); @@ -5058,6 +5191,8 @@ static void perf_event_mmap_output(struct perf_event *event,  		perf_output_put(&handle, mmap_event->min);  		perf_output_put(&handle, mmap_event->ino);  		perf_output_put(&handle, mmap_event->ino_generation); +		perf_output_put(&handle, mmap_event->prot); +		perf_output_put(&handle, mmap_event->flags);  	}  	__output_copy(&handle, mmap_event->file_name, @@ -5076,30 +5211,30 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)  	struct file *file = vma->vm_file;  	int maj = 0, min = 0;  	u64 ino = 0, gen = 0; +	u32 prot = 0, flags = 0;  	unsigned int size;  	char tmp[16];  	char *buf = NULL; -	const char *name; - -	memset(tmp, 0, sizeof(tmp)); +	char *name;  	if (file) {  		struct inode *inode;  		dev_t dev; + +		buf = kmalloc(PATH_MAX, GFP_KERNEL); +		if (!buf) { +			name = "//enomem"; +			goto cpy_name; +		}  		/* -		 * d_path works from the end of the rb backwards, so we +		 * d_path() works from the end of the rb backwards, so we  		 * need to add enough zero bytes after the string to handle  		 * the 64bit alignment we do later.  		 
*/ -		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); -		if (!buf) { -			name = strncpy(tmp, "//enomem", sizeof(tmp)); -			goto got_name; -		} -		name = d_path(&file->f_path, buf, PATH_MAX); +		name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));  		if (IS_ERR(name)) { -			name = strncpy(tmp, "//toolong", sizeof(tmp)); -			goto got_name; +			name = "//toolong"; +			goto cpy_name;  		}  		inode = file_inode(vma->vm_file);  		dev = inode->i_sb->s_dev; @@ -5108,33 +5243,60 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)  		maj = MAJOR(dev);  		min = MINOR(dev); +		if (vma->vm_flags & VM_READ) +			prot |= PROT_READ; +		if (vma->vm_flags & VM_WRITE) +			prot |= PROT_WRITE; +		if (vma->vm_flags & VM_EXEC) +			prot |= PROT_EXEC; + +		if (vma->vm_flags & VM_MAYSHARE) +			flags = MAP_SHARED; +		else +			flags = MAP_PRIVATE; + +		if (vma->vm_flags & VM_DENYWRITE) +			flags |= MAP_DENYWRITE; +		if (vma->vm_flags & VM_MAYEXEC) +			flags |= MAP_EXECUTABLE; +		if (vma->vm_flags & VM_LOCKED) +			flags |= MAP_LOCKED; +		if (vma->vm_flags & VM_HUGETLB) +			flags |= MAP_HUGETLB; + +		goto got_name;  	} else { -		if (arch_vma_name(mmap_event->vma)) { -			name = strncpy(tmp, arch_vma_name(mmap_event->vma), -				       sizeof(tmp) - 1); -			tmp[sizeof(tmp) - 1] = '\0'; -			goto got_name; -		} +		name = (char *)arch_vma_name(vma); +		if (name) +			goto cpy_name; -		if (!vma->vm_mm) { -			name = strncpy(tmp, "[vdso]", sizeof(tmp)); -			goto got_name; -		} else if (vma->vm_start <= vma->vm_mm->start_brk && +		if (vma->vm_start <= vma->vm_mm->start_brk &&  				vma->vm_end >= vma->vm_mm->brk) { -			name = strncpy(tmp, "[heap]", sizeof(tmp)); -			goto got_name; -		} else if (vma->vm_start <= vma->vm_mm->start_stack && +			name = "[heap]"; +			goto cpy_name; +		} +		if (vma->vm_start <= vma->vm_mm->start_stack &&  				vma->vm_end >= vma->vm_mm->start_stack) { -			name = strncpy(tmp, "[stack]", sizeof(tmp)); -			goto got_name; +			name = "[stack]"; +			goto cpy_name;  		} -		name = strncpy(tmp, "//anon", sizeof(tmp)); -		goto got_name; +		name = "//anon"; +		goto cpy_name;  	} +cpy_name: +	strlcpy(tmp, name, sizeof(tmp)); +	name = tmp;  got_name: -	size = ALIGN(strlen(name)+1, sizeof(u64)); +	/* +	 * Since our buffer works in 8 byte units we need to align our string +	 * size to a multiple of 8. However, we must guarantee the tail end is +	 * zero'd out to avoid leaking random bits to userspace. 
+	 */ +	size = strlen(name)+1; +	while (!IS_ALIGNED(size, sizeof(u64))) +		name[size++] = '\0';  	mmap_event->file_name = name;  	mmap_event->file_size = size; @@ -5142,6 +5304,8 @@ got_name:  	mmap_event->min = min;  	mmap_event->ino = ino;  	mmap_event->ino_generation = gen; +	mmap_event->prot = prot; +	mmap_event->flags = flags;  	if (!(vma->vm_flags & VM_EXEC))  		mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; @@ -5182,6 +5346,8 @@ void perf_event_mmap(struct vm_area_struct *vma)  		/* .min (attr_mmap2 only) */  		/* .ino (attr_mmap2 only) */  		/* .ino_generation (attr_mmap2 only) */ +		/* .prot (attr_mmap2 only) */ +		/* .flags (attr_mmap2 only) */  	};  	perf_event_mmap_event(&mmap_event); @@ -5318,6 +5484,9 @@ struct swevent_htable {  	/* Recursion avoidance in each contexts */  	int				recursion[PERF_NR_CONTEXTS]; + +	/* Keeps track of cpu being initialized/exited */ +	bool				online;  };  static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); @@ -5564,8 +5733,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)  	hwc->state = !(flags & PERF_EF_START);  	head = find_swevent_head(swhash, event); -	if (WARN_ON_ONCE(!head)) +	if (!head) { +		/* +		 * We can race with cpu hotplug code. Do not +		 * WARN if the cpu just got unplugged. +		 */ +		WARN_ON_ONCE(swhash->online);  		return -EINVAL; +	}  	hlist_add_head_rcu(&event->hlist_entry, head); @@ -5622,11 +5797,6 @@ static void swevent_hlist_put(struct perf_event *event)  {  	int cpu; -	if (event->cpu != -1) { -		swevent_hlist_put_cpu(event, event->cpu); -		return; -	} -  	for_each_possible_cpu(cpu)  		swevent_hlist_put_cpu(event, cpu);  } @@ -5660,9 +5830,6 @@ static int swevent_hlist_get(struct perf_event *event)  	int err;  	int cpu, failed_cpu; -	if (event->cpu != -1) -		return swevent_hlist_get_cpu(event, event->cpu); -  	get_online_cpus();  	for_each_possible_cpu(cpu) {  		err = swevent_hlist_get_cpu(event, cpu); @@ -6214,7 +6381,7 @@ static int perf_event_idx_default(struct perf_event *event)   * Ensures all contexts with the same task_ctx_nr have the same   * pmu_cpu_context too.   
*/ -static void *find_pmu_context(int ctxn) +static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)  {  	struct pmu *pmu; @@ -6271,6 +6438,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)  	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);  } +static DEVICE_ATTR_RO(type);  static ssize_t  perf_event_mux_interval_ms_show(struct device *dev, @@ -6315,17 +6483,19 @@ perf_event_mux_interval_ms_store(struct device *dev,  	return count;  } +static DEVICE_ATTR_RW(perf_event_mux_interval_ms); -static struct device_attribute pmu_dev_attrs[] = { -	__ATTR_RO(type), -	__ATTR_RW(perf_event_mux_interval_ms), -	__ATTR_NULL, +static struct attribute *pmu_dev_attrs[] = { +	&dev_attr_type.attr, +	&dev_attr_perf_event_mux_interval_ms.attr, +	NULL,  }; +ATTRIBUTE_GROUPS(pmu_dev);  static int pmu_bus_running;  static struct bus_type pmu_bus = {  	.name		= "event_source", -	.dev_attrs	= pmu_dev_attrs, +	.dev_groups	= pmu_dev_groups,  };  static void pmu_dev_release(struct device *dev) @@ -6466,6 +6636,7 @@ free_pdc:  	free_percpu(pmu->pmu_disable_count);  	goto unlock;  } +EXPORT_SYMBOL_GPL(perf_pmu_register);  void perf_pmu_unregister(struct pmu *pmu)  { @@ -6487,6 +6658,7 @@ void perf_pmu_unregister(struct pmu *pmu)  	put_device(pmu->dev);  	free_pmu_context(pmu);  } +EXPORT_SYMBOL_GPL(perf_pmu_unregister);  struct pmu *perf_init_event(struct perf_event *event)  { @@ -6500,6 +6672,10 @@ struct pmu *perf_init_event(struct perf_event *event)  	pmu = idr_find(&pmu_idr, event->attr.type);  	rcu_read_unlock();  	if (pmu) { +		if (!try_module_get(pmu->module)) { +			pmu = ERR_PTR(-ENODEV); +			goto unlock; +		}  		event->pmu = pmu;  		ret = pmu->event_init(event);  		if (ret) @@ -6508,6 +6684,10 @@ struct pmu *perf_init_event(struct perf_event *event)  	}  	list_for_each_entry_rcu(pmu, &pmus, entry) { +		if (!try_module_get(pmu->module)) { +			pmu = ERR_PTR(-ENODEV); +			goto unlock; +		}  		event->pmu = pmu;  		ret = pmu->event_init(event);  		if (!ret) @@ -6602,6 +6782,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,  	INIT_LIST_HEAD(&event->event_entry);  	INIT_LIST_HEAD(&event->sibling_list);  	INIT_LIST_HEAD(&event->rb_entry); +	INIT_LIST_HEAD(&event->active_entry); +	INIT_HLIST_NODE(&event->hlist_entry); +  	init_waitqueue_head(&event->waitq);  	init_irq_work(&event->pending, perf_pending_event); @@ -6683,6 +6866,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,  err_pmu:  	if (event->destroy)  		event->destroy(event); +	module_put(pmu->module);  err_ns:  	if (event->ns)  		put_pid_ns(event->ns); @@ -6822,7 +7006,7 @@ err_size:  static int  perf_event_set_output(struct perf_event *event, struct perf_event *output_event)  { -	struct ring_buffer *rb = NULL, *old_rb = NULL; +	struct ring_buffer *rb = NULL;  	int ret = -EINVAL;  	if (!output_event) @@ -6850,8 +7034,6 @@ set:  	if (atomic_read(&event->mmap_count))  		goto unlock; -	old_rb = event->rb; -  	if (output_event) {  		/* get the rb we want to redirect to */  		rb = ring_buffer_get(output_event); @@ -6859,23 +7041,7 @@ set:  			goto unlock;  	} -	if (old_rb) -		ring_buffer_detach(event, old_rb); - -	if (rb) -		ring_buffer_attach(event, rb); - -	rcu_assign_pointer(event->rb, rb); - -	if (old_rb) { -		ring_buffer_put(old_rb); -		/* -		 * Since we detached before setting the new rb, so that we -		 * could attach the new rb, we could have missed a wakeup. -		 * Provide it now. 
-		 */ -		wake_up_all(&event->waitq); -	} +	ring_buffer_attach(event, rb);  	ret = 0;  unlock: @@ -6908,6 +7074,7 @@ SYSCALL_DEFINE5(perf_event_open,  	int event_fd;  	int move_group = 0;  	int err; +	int f_flags = O_RDWR;  	/* for future expandability... */  	if (flags & ~PERF_FLAG_ALL) @@ -6925,6 +7092,9 @@ SYSCALL_DEFINE5(perf_event_open,  	if (attr.freq) {  		if (attr.sample_freq > sysctl_perf_event_sample_rate)  			return -EINVAL; +	} else { +		if (attr.sample_period & (1ULL << 63)) +			return -EINVAL;  	}  	/* @@ -6936,7 +7106,10 @@ SYSCALL_DEFINE5(perf_event_open,  	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))  		return -EINVAL; -	event_fd = get_unused_fd(); +	if (flags & PERF_FLAG_FD_CLOEXEC) +		f_flags |= O_CLOEXEC; + +	event_fd = get_unused_fd_flags(f_flags);  	if (event_fd < 0)  		return event_fd; @@ -6959,20 +7132,33 @@ SYSCALL_DEFINE5(perf_event_open,  		}  	} +	if (task && group_leader && +	    group_leader->attr.inherit != attr.inherit) { +		err = -EINVAL; +		goto err_task; +	} +  	get_online_cpus();  	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,  				 NULL, NULL);  	if (IS_ERR(event)) {  		err = PTR_ERR(event); -		goto err_task; +		goto err_cpus;  	}  	if (flags & PERF_FLAG_PID_CGROUP) {  		err = perf_cgroup_connect(pid, event, &attr, group_leader);  		if (err) {  			__free_event(event); -			goto err_task; +			goto err_cpus; +		} +	} + +	if (is_sampling_event(event)) { +		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { +			err = -ENOTSUPP; +			goto err_alloc;  		}  	} @@ -7058,7 +7244,8 @@ SYSCALL_DEFINE5(perf_event_open,  			goto err_context;  	} -	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); +	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, +					f_flags);  	if (IS_ERR(event_file)) {  		err = PTR_ERR(event_file);  		goto err_context; @@ -7068,7 +7255,7 @@ SYSCALL_DEFINE5(perf_event_open,  		struct perf_event_context *gctx = group_leader->ctx;  		mutex_lock(&gctx->mutex); -		perf_remove_from_context(group_leader); +		perf_remove_from_context(group_leader, false);  		/*  		 * Removing from the context ends up with disabled @@ -7078,7 +7265,7 @@ SYSCALL_DEFINE5(perf_event_open,  		perf_event__state_init(group_leader);  		list_for_each_entry(sibling, &group_leader->sibling_list,  				    group_entry) { -			perf_remove_from_context(sibling); +			perf_remove_from_context(sibling, false);  			perf_event__state_init(sibling);  			put_ctx(gctx);  		} @@ -7101,7 +7288,6 @@ SYSCALL_DEFINE5(perf_event_open,  	}  	perf_install_in_context(ctx, event, event->cpu); -	++ctx->generation;  	perf_unpin_context(ctx);  	mutex_unlock(&ctx->mutex); @@ -7134,8 +7320,9 @@ err_context:  	put_ctx(ctx);  err_alloc:  	free_event(event); -err_task: +err_cpus:  	put_online_cpus(); +err_task:  	if (task)  		put_task_struct(task);  err_group_fd: @@ -7184,7 +7371,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,  	WARN_ON_ONCE(ctx->parent_ctx);  	mutex_lock(&ctx->mutex);  	perf_install_in_context(ctx, event, cpu); -	++ctx->generation;  	perf_unpin_context(ctx);  	mutex_unlock(&ctx->mutex); @@ -7210,18 +7396,18 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)  	mutex_lock(&src_ctx->mutex);  	list_for_each_entry_safe(event, tmp, &src_ctx->event_list,  				 event_entry) { -		perf_remove_from_context(event); +		perf_remove_from_context(event, false);  		unaccount_event_cpu(event, src_cpu);  		put_ctx(src_ctx); -		list_add(&event->event_entry, &events); +		
list_add(&event->migrate_entry, &events);  	}  	mutex_unlock(&src_ctx->mutex);  	synchronize_rcu();  	mutex_lock(&dst_ctx->mutex); -	list_for_each_entry_safe(event, tmp, &events, event_entry) { -		list_del(&event->event_entry); +	list_for_each_entry_safe(event, tmp, &events, migrate_entry) { +		list_del(&event->migrate_entry);  		if (event->state >= PERF_EVENT_STATE_OFF)  			event->state = PERF_EVENT_STATE_INACTIVE;  		account_event_cpu(event, dst_cpu); @@ -7272,13 +7458,19 @@ __perf_event_exit_task(struct perf_event *child_event,  			 struct perf_event_context *child_ctx,  			 struct task_struct *child)  { -	if (child_event->parent) { -		raw_spin_lock_irq(&child_ctx->lock); -		perf_group_detach(child_event); -		raw_spin_unlock_irq(&child_ctx->lock); -	} - -	perf_remove_from_context(child_event); +	/* +	 * Do not destroy the 'original' grouping; because of the context +	 * switch optimization the original events could've ended up in a +	 * random child task. +	 * +	 * If we were to destroy the original group, all group related +	 * operations would cease to function properly after this random +	 * child dies. +	 * +	 * Do destroy all inherited groups, we don't care about those +	 * and being thorough is better. +	 */ +	perf_remove_from_context(child_event, !!child_event->parent);  	/*  	 * It can happen that the parent exits first, and has events @@ -7293,8 +7485,8 @@ __perf_event_exit_task(struct perf_event *child_event,  static void perf_event_exit_task_context(struct task_struct *child, int ctxn)  { -	struct perf_event *child_event, *tmp; -	struct perf_event_context *child_ctx; +	struct perf_event *child_event, *next; +	struct perf_event_context *child_ctx, *parent_ctx;  	unsigned long flags;  	if (likely(!child->perf_event_ctxp[ctxn])) { @@ -7319,6 +7511,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)  	raw_spin_lock(&child_ctx->lock);  	task_ctx_sched_out(child_ctx);  	child->perf_event_ctxp[ctxn] = NULL; + +	/* +	 * In order to avoid freeing: child_ctx->parent_ctx->task +	 * under perf_event_context::lock, grab another reference. +	 */ +	parent_ctx = child_ctx->parent_ctx; +	if (parent_ctx) +		get_ctx(parent_ctx); +  	/*  	 * If this context is a clone; unclone it so it can't get  	 * swapped to another process while we're removing all @@ -7329,6 +7530,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)  	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);  	/* +	 * Now that we no longer hold perf_event_context::lock, drop +	 * our extra child_ctx->parent_ctx reference. +	 */ +	if (parent_ctx) +		put_ctx(parent_ctx); + +	/*  	 * Report the task dead after unscheduling the events so that we  	 * won't get any samples after PERF_RECORD_EXIT. We can however still  	 * get a few PERF_RECORD_READ events. 
@@ -7347,24 +7555,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)  	 */  	mutex_lock(&child_ctx->mutex); -again: -	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, -				 group_entry) -		__perf_event_exit_task(child_event, child_ctx, child); - -	list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, -				 group_entry) +	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)  		__perf_event_exit_task(child_event, child_ctx, child); -	/* -	 * If the last event was a group event, it will have appended all -	 * its siblings to the list, but we obtained 'tmp' before that which -	 * will still point to the list head terminating the iteration. -	 */ -	if (!list_empty(&child_ctx->pinned_groups) || -	    !list_empty(&child_ctx->flexible_groups)) -		goto again; -  	mutex_unlock(&child_ctx->mutex);  	put_ctx(child_ctx); @@ -7629,6 +7822,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn)  	 * swapped under us.  	 */  	parent_ctx = perf_pin_task_context(parent, ctxn); +	if (!parent_ctx) +		return 0;  	/*  	 * No need to check if parent_ctx != NULL here; since we saw @@ -7740,6 +7935,7 @@ static void perf_event_init_cpu(int cpu)  	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);  	mutex_lock(&swhash->hlist_mutex); +	swhash->online = true;  	if (swhash->hlist_refcount > 0) {  		struct swevent_hlist *hlist; @@ -7762,15 +7958,15 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)  static void __perf_event_exit_context(void *__info)  { +	struct remove_event re = { .detach_group = false };  	struct perf_event_context *ctx = __info; -	struct perf_event *event, *tmp;  	perf_pmu_rotate_stop(ctx->pmu); -	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) -		__perf_remove_from_context(event); -	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) -		__perf_remove_from_context(event); +	rcu_read_lock(); +	list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) +		__perf_remove_from_context(&re); +	rcu_read_unlock();  }  static void perf_event_exit_cpu_context(int cpu) @@ -7794,11 +7990,12 @@ static void perf_event_exit_cpu(int cpu)  {  	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); +	perf_event_exit_cpu_context(cpu); +  	mutex_lock(&swhash->hlist_mutex); +	swhash->online = false;  	swevent_hlist_release(swhash);  	mutex_unlock(&swhash->hlist_mutex); - -	perf_event_exit_cpu_context(cpu);  }  #else  static inline void perf_event_exit_cpu(int cpu) { } @@ -7943,7 +8140,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,  {  	struct task_struct *task; -	cgroup_taskset_for_each(task, css, tset) +	cgroup_taskset_for_each(task, tset)  		task_function_call(task, __perf_cgroup_move, task);  } @@ -7962,9 +8159,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css,  	task_function_call(task, __perf_cgroup_move, task);  } -struct cgroup_subsys perf_subsys = { -	.name		= "perf_event", -	.subsys_id	= perf_subsys_id, +struct cgroup_subsys perf_event_cgrp_subsys = {  	.css_alloc	= perf_cgroup_css_alloc,  	.css_free	= perf_cgroup_css_free,  	.exit		= perf_cgroup_exit,  | 
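
A few of the changes above are worth illustrating with standalone sketches. First, the sample-rate throttling rework: perf_sample_event_took() runs from the PMU interrupt (potentially NMI) path, so the warning is no longer printed inline; an irq_work is queued instead and perf_duration_warn() does the ratelimited printk later, with early_printk() only as a fallback when the work cannot be queued. Below is a minimal, illustrative module using the same defer-the-printk pattern; the names and message are invented for the example, only DEFINE_IRQ_WORK(), irq_work_queue() and irq_work_sync() are the kernel APIs the patch itself relies on.

#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/printk.h>

/* Runs later, from a context where printk() is safe. */
static void report_slow_path(struct irq_work *w)
{
	printk_ratelimited(KERN_WARNING "demo: slow path detected\n");
}

static DEFINE_IRQ_WORK(slow_path_work, report_slow_path);

/*
 * May be called from hard-IRQ or NMI context; it only queues the work.
 * In this demo it is simply invoked once at module load.
 */
static void hot_path_hit(void)
{
	irq_work_queue(&slow_path_work);
}

static int __init irqwork_demo_init(void)
{
	hot_path_hit();
	return 0;
}

static void __exit irqwork_demo_exit(void)
{
	irq_work_sync(&slow_path_work);
}

module_init(irqwork_demo_init);
module_exit(irqwork_demo_exit);
MODULE_LICENSE("GPL");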
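
The context_equiv() rewrite replaces the old "same parent, same parent_gen" test with a generation counter that is bumped on every modification (see the increments added to unclone_ctx(), list_add_event() and list_del_event()), and it additionally accepts direct parent/child matches. The sketch below models the idea with a simplified, standalone structure; it is illustrative only and not the kernel's perf_event_context.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct demo_ctx {
	struct demo_ctx *parent;	/* context this one was cloned from, or NULL */
	uint64_t generation;		/* bumped on every event add/remove/unclone  */
	uint64_t parent_gen;		/* parent->generation captured at clone time */
	int pin_count;
};

static bool demo_ctx_equiv(const struct demo_ctx *c1, const struct demo_ctx *c2)
{
	/* Pinning disables the context-switch swap optimization. */
	if (c1->pin_count || c2->pin_count)
		return false;

	/* c1 is c2's parent and has not been modified since the clone. */
	if (c1 == c2->parent && c1->generation == c2->parent_gen)
		return true;

	/* c2 is c1's parent and has not been modified since the clone. */
	if (c2 == c1->parent && c2->generation == c1->parent_gen)
		return true;

	/* Both were cloned from the same parent at the same generation. */
	return c1->parent != NULL && c1->parent == c2->parent &&
	       c1->parent_gen == c2->parent_gen;
}

Any path that adds or removes an event simply increments generation, which invalidates every outstanding clone relationship without having to walk the child contexts.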
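
perf_event_period() also changes behaviour: when the event is active it is stopped, hw.period_left is cleared and the event restarted, so a period written via PERF_EVENT_IOC_PERIOD takes effect from the next sample instead of only after the previously programmed period has run out. (Note also the new sanity check in perf_event_open(): for non-frequency events a sample_period with bit 63 set is now rejected.) A small sketch of the ioctl call, assuming fd refers to a sampling event created with attr.sample_period:

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/*
 * Lower (or raise) the sample period of an already running event.
 * With this patch the new value is loaded immediately; previously the
 * old period had to expire first.
 */
static int set_sample_period(int fd, unsigned long long period)
{
	return ioctl(fd, PERF_EVENT_IOC_PERIOD, &period);
}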
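
perf_event_comm() gains an exec argument and tags the resulting record with PERF_RECORD_MISC_COMM_EXEC in header.misc, while the enable-on-exec work moves into the new perf_event_exec() hook. A consumer can use that bit to tell an exec() apart from a plain comm rename (e.g. prctl(PR_SET_NAME)); the fallback #define below is an assumption for building against older headers.

#include <stdbool.h>
#include <linux/perf_event.h>

#ifndef PERF_RECORD_MISC_COMM_EXEC
#define PERF_RECORD_MISC_COMM_EXEC (1 << 13)	/* assumption: see the uapi side of this series */
#endif

/* True if a PERF_RECORD_COMM was generated by exec() rather than by a
 * task renaming itself. */
static bool comm_record_is_exec(const struct perf_event_header *hdr)
{
	return hdr->type == PERF_RECORD_COMM &&
	       (hdr->misc & PERF_RECORD_MISC_COMM_EXEC);
}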
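
PERF_RECORD_MMAP2 grows two u32 fields, prot and flags, reconstructed from vma->vm_flags and written right after ino_generation in perf_event_mmap_output(). The consumer-side layout below is inferred from the output order in that function and should be treated as an assumption to verify against the matching uapi/tools header update; the trailing sample_id fields (when sample_id_all is set) follow the filename and are omitted here.

#include <stdint.h>
#include <linux/perf_event.h>

/* Inferred layout of a PERF_RECORD_MMAP2 body after this patch. */
struct mmap2_record {
	struct perf_event_header header;
	uint32_t pid, tid;
	uint64_t addr, len, pgoff;
	uint32_t maj, min;
	uint64_t ino, ino_generation;
	uint32_t prot;		/* PROT_READ/PROT_WRITE/PROT_EXEC from vm_flags */
	uint32_t flags;		/* MAP_SHARED or MAP_PRIVATE plus MAP_* bits    */
	char filename[];	/* NUL-padded to an 8-byte boundary (see got_name) */
};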
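
On the ABI side, the syscall now accepts PERF_FLAG_FD_CLOEXEC and passes the resulting O_CLOEXEC through get_unused_fd_flags() and anon_inode_getfile(), so the event fd can be created close-on-exec without a racy fcntl() afterwards. A minimal userspace sketch, assuming a kernel with this change; the fallback #define is only for building against older uapi headers and mirrors the value used by the accompanying header update.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

#ifndef PERF_FLAG_FD_CLOEXEC
#define PERF_FLAG_FD_CLOEXEC (1UL << 3)	/* assumption: matches the uapi addition */
#endif

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
				int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;		/* we only demonstrate fd creation */
	attr.exclude_kernel = 1;	/* keeps this usable under restrictive paranoid settings */

	/* Count instructions of the calling task on any CPU, close-on-exec. */
	fd = sys_perf_event_open(&attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	printf("perf event fd %ld opened with O_CLOEXEC set\n", fd);
	close(fd);
	return 0;
}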
