diff options
author | Ingo Molnar <mingo@elte.hu> | 2010-03-04 11:47:50 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2010-03-04 11:47:52 +0100 |
commit | 4f16d4e0c9a4b20d9f0db365587b96d6001efd7d (patch) | |
tree | fa25dcf285b26f1fac2bf267d0d1cd2c4eba90b8 /kernel | |
parent | 1e259e0a9982078896f3404240096cbea01daca4 (diff) | |
parent | 6630125419ef37ff8781713c5e9d416f2a4ba357 (diff) |
Merge branch 'perf/core' into perf/urgent
Merge reason: Switch from pre-merge topical split to the post-merge urgent track
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/futex.c | 30 | ||||
-rw-r--r-- | kernel/hw_breakpoint.c | 10 | ||||
-rw-r--r-- | kernel/kfifo.c | 3 | ||||
-rw-r--r-- | kernel/kgdb.c | 6 | ||||
-rw-r--r-- | kernel/kprobes.c | 34 | ||||
-rw-r--r-- | kernel/perf_event.c | 642 | ||||
-rw-r--r-- | kernel/sched.c | 12 | ||||
-rw-r--r-- | kernel/softirq.c | 15 | ||||
-rw-r--r-- | kernel/softlockup.c | 15 | ||||
-rw-r--r-- | kernel/sys.c | 2 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 2 | ||||
-rw-r--r-- | kernel/trace/Makefile | 4 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 54 | ||||
-rw-r--r-- | kernel/trace/trace_event_profile.c | 52 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 198 | ||||
-rw-r--r-- | kernel/trace/trace_stack.c | 24 | ||||
-rw-r--r-- | kernel/trace/trace_syscalls.c | 76 |
18 files changed, 666 insertions, 517 deletions
diff --git a/kernel/futex.c b/kernel/futex.c index d9b3a2228f9..e7a35f1039e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return -EINVAL; WARN_ON(!atomic_read(&pi_state->refcount)); - WARN_ON(pid && pi_state->owner && - pi_state->owner->pid != pid); + + /* + * When pi_state->owner is NULL then the owner died + * and another waiter is on the fly. pi_state->owner + * is fixed up by the task which acquires + * pi_state->rt_mutex. + * + * We do not check for pid == 0 which can happen when + * the owner died and robust_list_exit() cleared the + * TID. + */ + if (pid && pi_state->owner) { + /* + * Bail out if user space manipulated the + * futex value. + */ + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; + } atomic_inc(&pi_state->refcount); *ps = pi_state; @@ -758,6 +775,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!pi_state) return -EINVAL; + /* + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + return -EINVAL; + raw_spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); @@ -1971,7 +1995,7 @@ retry_private: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - goto out; + goto out_put_key; out_unlock_put_key: queue_unlock(&q, hb); diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 4d99512ee14..03808ed342a 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -413,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); * * @return a set of per_cpu pointers to perf events */ -struct perf_event ** +struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered) { - struct perf_event **cpu_events, **pevent, *bp; + struct perf_event * __percpu *cpu_events, **pevent, *bp; long err; int cpu; cpu_events = alloc_percpu(typeof(*cpu_events)); if (!cpu_events) - return ERR_PTR(-ENOMEM); + return (void __percpu __force *)ERR_PTR(-ENOMEM); get_online_cpus(); for_each_online_cpu(cpu) { @@ -451,7 +451,7 @@ fail: put_online_cpus(); free_percpu(cpu_events); - return ERR_PTR(err); + return (void __percpu __force *)ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); @@ -459,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel * @cpu_events: the per cpu set of events to unregister */ -void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) +void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { int cpu; struct perf_event **pevent; diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 498cabba225..35edbe22e9a 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) buffer = kmalloc(size, gfp_mask); if (!buffer) { - _kfifo_init(fifo, 0, 0); + _kfifo_init(fifo, NULL, 0); return -ENOMEM; } @@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc); void kfifo_free(struct kfifo *fifo) { kfree(fifo->buffer); + _kfifo_init(fifo, NULL, 0); } EXPORT_SYMBOL(kfifo_free); diff --git a/kernel/kgdb.c b/kernel/kgdb.c index c7ade62e4ef..761fdd2b303 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -599,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1453,7 +1453,7 @@ acquirelock: (kgdb_info[cpu].task && kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1553,7 +1553,7 @@ kgdb_restore: } /* Free kgdb_active */ atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b7df302a020..ccec774c716 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -44,6 +44,7 @@ #include <linux/debugfs.h> #include <linux/kdebug.h> #include <linux/memory.h> +#include <linux/ftrace.h> #include <asm-generic/sections.h> #include <asm/cacheflush.h> @@ -93,6 +94,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { {"native_get_debugreg",}, {"irq_entries_start",}, {"common_interrupt",}, + {"mcount",}, /* mcount can be called from everywhere */ {NULL} /* Terminator */ }; @@ -124,30 +126,6 @@ static LIST_HEAD(kprobe_insn_pages); static int kprobe_garbage_slots; static int collect_garbage_slots(void); -static int __kprobes check_safety(void) -{ - int ret = 0; -#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER) - ret = freeze_processes(); - if (ret == 0) { - struct task_struct *p, *q; - do_each_thread(p, q) { - if (p != current && p->state == TASK_RUNNING && - p->pid != 0) { - printk("Check failed: %s is running\n",p->comm); - ret = -1; - goto loop_end; - } - } while_each_thread(p, q); - } -loop_end: - thaw_processes(); -#else - synchronize_sched(); -#endif - return ret; -} - /** * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. @@ -235,9 +213,8 @@ static int __kprobes collect_garbage_slots(void) { struct kprobe_insn_page *kip, *next; - /* Ensure no-one is preepmted on the garbages */ - if (check_safety()) - return -EAGAIN; + /* Ensure no-one is interrupted on the garbages */ + synchronize_sched(); list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { int i; @@ -728,7 +705,8 @@ int __kprobes register_kprobe(struct kprobe *p) preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr)) { + in_kprobes_functions((unsigned long) p->addr) || + ftrace_text_reserved(p->addr, p->addr)) { preempt_enable(); return -EINVAL; } diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2ae7409bf38..482d5e1d376 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -56,21 +56,6 @@ static atomic_t nr_task_events __read_mostly; */ int sysctl_perf_event_paranoid __read_mostly = 1; -static inline bool perf_paranoid_tracepoint_raw(void) -{ - return sysctl_perf_event_paranoid > -1; -} - -static inline bool perf_paranoid_cpu(void) -{ - return sysctl_perf_event_paranoid > 0; -} - -static inline bool perf_paranoid_kernel(void) -{ - return sysctl_perf_event_paranoid > 1; -} - int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* @@ -98,11 +83,12 @@ void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_event_setup(int cpu) { barrier(); } void __weak hw_perf_event_setup_online(int cpu) { barrier(); } +void __weak hw_perf_event_setup_offline(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_event *group_leader, struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, int cpu) + struct perf_event_context *ctx) { return 0; } @@ -248,7 +234,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) static inline u64 perf_clock(void) { - return cpu_clock(smp_processor_id()); + return cpu_clock(raw_smp_processor_id()); } /* @@ -289,6 +275,15 @@ static void update_event_times(struct perf_event *event) event->total_time_running = run_end - event->tstamp_running; } +static struct list_head * +ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +{ + if (event->attr.pinned) + return &ctx->pinned_groups; + else + return &ctx->flexible_groups; +} + /* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -303,9 +298,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) * add it straight to the context's event list, or to the group * leader's sibling list: */ - if (group_leader == event) - list_add_tail(&event->group_entry, &ctx->group_list); - else { + if (group_leader == event) { + struct list_head *list; + + if (is_software_event(event)) + event->group_flags |= PERF_GROUP_SOFTWARE; + + list = ctx_group_list(event, ctx); + list_add_tail(&event->group_entry, list); + } else { + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; } @@ -355,9 +360,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) * to the context list directly: */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + struct list_head *list; - list_move_tail(&sibling->group_entry, &ctx->group_list); + list = ctx_group_list(event, ctx); + list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; + + /* Inherit group flags from the previous leader */ + sibling->group_flags = event->group_flags; } } @@ -608,14 +618,13 @@ void perf_event_disable(struct perf_event *event) static int event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - int cpu) + struct perf_event_context *ctx) { if (event->state <= PERF_EVENT_STATE_OFF) return 0; event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + event->oncpu = smp_processor_id(); /* * The new state must be visible before we turn it on in the hardware: */ @@ -642,8 +651,7 @@ event_sched_in(struct perf_event *event, static int group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - int cpu) + struct perf_event_context *ctx) { struct perf_event *event, *partial_group; int ret; @@ -651,18 +659,18 @@ group_sched_in(struct perf_event *group_event, if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); + ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); if (ret) return ret < 0 ? ret : 0; - if (event_sched_in(group_event, cpuctx, ctx, cpu)) + if (event_sched_in(group_event, cpuctx, ctx)) return -EAGAIN; /* * Schedule in siblings as one group (if any): */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event_sched_in(event, cpuctx, ctx, cpu)) { + if (event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; } @@ -686,24 +694,6 @@ group_error: } /* - * Return 1 for a group consisting entirely of software events, - * 0 if the group contains any hardware events. - */ -static int is_software_only_group(struct perf_event *leader) -{ - struct perf_event *event; - - if (!is_software_event(leader)) - return 0; - - list_for_each_entry(event, &leader->sibling_list, group_entry) - if (!is_software_event(event)) - return 0; - - return 1; -} - -/* * Work out whether we can put this event group on the CPU now. */ static int group_can_go_on(struct perf_event *event, @@ -713,7 +703,7 @@ static int group_can_go_on(struct perf_event *event, /* * Groups consisting entirely of software events can always go on. */ - if (is_software_only_group(event)) + if (event->group_flags & PERF_GROUP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware @@ -754,7 +744,6 @@ static void __perf_install_in_context(void *info) struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; - int cpu = smp_processor_id(); int err; /* @@ -801,7 +790,7 @@ static void __perf_install_in_context(void *info) if (!group_can_go_on(event, cpuctx, 1)) err = -EEXIST; else - err = event_sched_in(event, cpuctx, ctx, cpu); + err = event_sched_in(event, cpuctx, ctx); if (err) { /* @@ -943,11 +932,9 @@ static void __perf_event_enable(void *info) } else { perf_disable(); if (event == leader) - err = group_sched_in(event, cpuctx, ctx, - smp_processor_id()); + err = group_sched_in(event, cpuctx, ctx); else - err = event_sched_in(event, cpuctx, ctx, - smp_processor_id()); + err = event_sched_in(event, cpuctx, ctx); perf_enable(); } @@ -1043,8 +1030,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return 0; } -void __perf_event_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +enum event_type_t { + EVENT_FLEXIBLE = 0x1, + EVENT_PINNED = 0x2, + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, +}; + +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type) { struct perf_event *event; @@ -1055,10 +1049,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx, update_context_time(ctx); perf_disable(); - if (ctx->nr_active) { - list_for_each_entry(event, &ctx->group_list, group_entry) + if (!ctx->nr_active) + goto out_enable; + + if (event_type & EVENT_PINNED) + list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); - } + + if (event_type & EVENT_FLEXIBLE) + list_for_each_entry(event, &ctx->flexible_groups, group_entry) + group_sched_out(event, cpuctx, ctx); + + out_enable: perf_enable(); out: raw_spin_unlock(&ctx->lock); @@ -1170,9 +1172,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, * not restart the event. */ void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) + struct task_struct *next) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent; @@ -1220,15 +1222,13 @@ void perf_event_task_sched_out(struct task_struct *task, rcu_read_unlock(); if (do_switch) { - __perf_event_sched_out(ctx, cpuctx); + ctx_sched_out(ctx, cpuctx, EVENT_ALL); cpuctx->task_ctx = NULL; } } -/* - * Called with IRQs disabled - */ -static void __perf_event_task_sched_out(struct perf_event_context *ctx) +static void task_ctx_sched_out(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); @@ -1238,47 +1238,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx) if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - __perf_event_sched_out(ctx, cpuctx); + ctx_sched_out(ctx, cpuctx, event_type); cpuctx->task_ctx = NULL; } /* * Called with IRQs disabled */ -static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) +static void __perf_event_task_sched_out(struct perf_event_context *ctx) +{ + task_ctx_sched_out(ctx, EVENT_ALL); +} + +/* + * Called with IRQs disabled + */ +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type) { - __perf_event_sched_out(&cpuctx->ctx, cpuctx); + ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); } static void -__perf_event_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, int cpu) +ctx_pinned_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) { struct perf_event *event; - int can_add_hw = 1; - - raw_spin_lock(&ctx->lock); - ctx->is_active = 1; - if (likely(!ctx->nr_events)) - goto out; - - ctx->timestamp = perf_clock(); - - perf_disable(); - /* - * First go through the list and put on any pinned groups - * in order to give them the best chance of going on. - */ - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF || - !event->attr.pinned) + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) continue; - if (event->cpu != -1 && event->cpu != cpu) + if (event->cpu != -1 && event->cpu != smp_processor_id()) continue; if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx, cpu); + group_sched_in(event, cpuctx, ctx); /* * If this pinned group hasn't been scheduled, @@ -1289,32 +1283,83 @@ __perf_event_sched_in(struct perf_event_context *ctx, event->state = PERF_EVENT_STATE_ERROR; } } +} - list_for_each_entry(event, &ctx->group_list, group_entry) { - /* - * Ignore events in OFF or ERROR state, and - * ignore pinned events since we did them already. - */ - if (event->state <= PERF_EVENT_STATE_OFF || - event->attr.pinned) - continue; +static void +ctx_flexible_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_event *event; + int can_add_hw = 1; + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + /* Ignore events in OFF or ERROR state */ + if (event->state <= PERF_EVENT_STATE_OFF) + continue; /* * Listen to the 'cpu' scheduling filter constraint * of events: */ - if (event->cpu != -1 && event->cpu != cpu) + if (event->cpu != -1 && event->cpu != smp_processor_id()) continue; if (group_can_go_on(event, cpuctx, can_add_hw)) - if (group_sched_in(event, cpuctx, ctx, cpu)) + if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; } +} + +static void +ctx_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + raw_spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_events)) + goto out; + + ctx->timestamp = perf_clock(); + + perf_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + if (event_type & EVENT_PINNED) + ctx_pinned_sched_in(ctx, cpuctx); + + /* Then walk through the lower prio flexible groups */ + if (event_type & EVENT_FLEXIBLE) + ctx_flexible_sched_in(ctx, cpuctx); + perf_enable(); out: raw_spin_unlock(&ctx->lock); } +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + struct perf_event_context *ctx = &cpuctx->ctx; + + ctx_sched_in(ctx, cpuctx, event_type); +} + +static void task_ctx_sched_in(struct task_struct *task, + enum event_type_t event_type) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = task->perf_event_ctxp; + + if (likely(!ctx)) + return; + if (cpuctx->task_ctx == ctx) + return; + ctx_sched_in(ctx, cpuctx, event_type); + cpuctx->task_ctx = ctx; +} /* * Called from scheduler to add the events of the current task * with interrupts disabled. @@ -1326,38 +1371,128 @@ __perf_event_sched_in(struct perf_event_context *ctx, * accessing the event control register. If a NMI hits, then it will * keep the event running. */ -void perf_event_task_sched_in(struct task_struct *task, int cpu) +void perf_event_task_sched_in(struct task_struct *task) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; if (likely(!ctx)) return; + if (cpuctx->task_ctx == ctx) return; - __perf_event_sched_in(ctx, cpuctx, cpu); + + /* + * We want to keep the following priority order: + * cpu pinned (that don't need to move), task pinned, + * cpu flexible, task flexible. + */ + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + + ctx_sched_in(ctx, cpuctx, EVENT_PINNED); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + cpuctx->task_ctx = ctx; } -static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + +static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) { - struct perf_event_context *ctx = &cpuctx->ctx; + u64 frequency = event->attr.sample_freq; + u64 sec = NSEC_PER_SEC; + u64 divisor, dividend; + + int count_fls, nsec_fls, frequency_fls, sec_fls; + + count_fls = fls64(count); + nsec_fls = fls64(nsec); + frequency_fls = fls64(frequency); + sec_fls = 30; + + /* + * We got @count in @nsec, with a target of sample_freq HZ + * the target period becomes: + * + * @count * 10^9 + * period = ------------------- + * @nsec * sample_freq + * + */ + + /* + * Reduce accuracy by one bit such that @a and @b converge + * to a similar magnitude. + */ +#define REDUCE_FLS(a, b) \ +do { \ + if (a##_fls > b##_fls) { \ + a >>= 1; \ + a##_fls--; \ + } else { \ + b >>= 1; \ + b##_fls--; \ + } \ +} while (0) + + /* + * Reduce accuracy until either term fits in a u64, then proceed with + * the other, so that finally we can do a u64/u64 division. + */ + while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + REDUCE_FLS(sec, count); + } + + if (count_fls + sec_fls > 64) { + divisor = nsec * frequency; + + while (count_fls + sec_fls > 64) { + REDUCE_FLS(count, sec); + divisor >>= 1; + } - __perf_event_sched_in(ctx, cpuctx, cpu); + dividend = count * sec; + } else { + dividend = count * sec; + + while (nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + dividend >>= 1; + } + + divisor = nsec * frequency; + } + + return div64_u64(dividend, divisor); } -#define MAX_INTERRUPTS (~0ULL) +static void perf_event_stop(struct perf_event *event) +{ + if (!event->pmu->stop) + return event->pmu->disable(event); -static void perf_log_throttle(struct perf_event *event, int enable); + return event->pmu->stop(event); +} + +static int perf_event_start(struct perf_event *event) +{ + if (!event->pmu->start) + return event->pmu->enable(event); -static void perf_adjust_period(struct perf_event *event, u64 events) + return event->pmu->start(event); +} + +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; u64 period, sample_period; s64 delta; - events *= hwc->sample_period; - period = div64_u64(events, event->attr.sample_freq); + period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); delta = (delta + 7) / 8; /* low pass filter */ @@ -1368,13 +1503,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events) sample_period = 1; hwc->sample_period = sample_period; + + if (atomic64_read(&hwc->period_left) > 8*sample_period) { + perf_disable(); + perf_event_stop(event); + atomic64_set(&hwc->period_left, 0); + perf_event_start(event); + perf_enable(); + } } static void perf_ctx_adjust_freq(struct perf_event_context *ctx) { struct perf_event *event; struct hw_perf_event *hwc; - u64 interrupts, freq; + u64 interrupts, now; + s64 delta; raw_spin_lock(&ctx->lock); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { @@ -1395,44 +1539,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); event->pmu->unthrottle(event); - interrupts = 2*sysctl_perf_event_sample_rate/HZ; } if (!event->attr.freq || !event->attr.sample_freq) continue; - /* - * if the specified freq < HZ then we need to skip ticks - */ - if (event->attr.sample_freq < HZ) { - freq = event->attr.sample_freq; - - hwc->freq_count += freq; - hwc->freq_interrupts += interrupts; - - if (hwc->freq_count < HZ) - continue; - - interrupts = hwc->freq_interrupts; - hwc->freq_interrupts = 0; - hwc->freq_count -= HZ; - } else - freq = HZ; - - perf_adjust_period(event, freq * interrupts); + event->pmu->read(event); + now = atomic64_read(&event->count); + delta = now - hwc->freq_count_stamp; + hwc->freq_count_stamp = now; - /* - * In order to avoid being stalled by an (accidental) huge - * sample period, force reset the sample period if we didn't - * get any events in this freq period. - */ - if (!interrupts) { - perf_disable(); - event->pmu->disable(event); - atomic64_set(&hwc->period_left, 0); - event->pmu->enable(event); - perf_enable(); - } + if (delta > 0) + perf_adjust_period(event, TICK_NSEC, delta); } raw_spin_unlock(&ctx->lock); } @@ -1442,26 +1560,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ static void rotate_ctx(struct perf_event_context *ctx) { - struct perf_event *event; - if (!ctx->nr_events) return; raw_spin_lock(&ctx->lock); - /* - * Rotate the first entry last (works just fine for group events too): - */ - perf_disable(); - list_for_each_entry(event, &ctx->group_list, group_entry) { - list_move_tail(&event->group_entry, &ctx->group_list); - break; - } - perf_enable(); + + /* Rotate the first entry last of non-pinned groups */ + list_rotate_left(&ctx->flexible_groups); raw_spin_unlock(&ctx->lock); } -void perf_event_task_tick(struct task_struct *curr, int cpu) +void perf_event_task_tick(struct task_struct *curr) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; @@ -1469,24 +1579,43 @@ void perf_event_task_tick(struct task_struct *curr, int cpu) if (!atomic_read(&nr_events)) return; - cpuctx = &per_cpu(perf_cpu_context, cpu); + cpuctx = &__get_cpu_var(perf_cpu_context); ctx = curr->perf_event_ctxp; + perf_disable(); + perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) perf_ctx_adjust_freq(ctx); - perf_event_cpu_sched_out(cpuctx); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) - __perf_event_task_sched_out(ctx); + task_ctx_sched_out(ctx, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx); if (ctx) rotate_ctx(ctx); - perf_event_cpu_sched_in(cpuctx, cpu); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) - perf_event_task_sched_in(curr, cpu); + task_ctx_sched_in(curr, EVENT_FLEXIBLE); + + perf_enable(); +} + +static int event_enable_on_exec(struct perf_event *event, + struct perf_event_context *ctx) +{ + if (!event->attr.enable_on_exec) + return 0; + + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + return 0; + + __perf_event_mark_enabled(event, ctx); + + return 1; } /* @@ -1499,6 +1628,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) struct perf_event *event; unsigned long flags; int enabled = 0; + int ret; local_irq_save(flags); ctx = task->perf_event_ctxp; @@ -1509,14 +1639,16 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_lock(&ctx->lock); - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (!event->attr.enable_on_exec) - continue; - event->attr.enable_on_exec = 0; - if (event->state >= PERF_EVENT_STATE_INACTIVE) - continue; - __perf_event_mark_enabled(event, ctx); - enabled = 1; + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; + } + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + |