diff options
Diffstat (limited to 'arch/x86/kernel/cpu/perf_event.c')
| -rw-r--r-- | arch/x86/kernel/cpu/perf_event.c | 728 |
1 files changed, 591 insertions, 137 deletions
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5adce1040b1..2879ecdaac4 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -24,27 +24,19 @@ #include <linux/slab.h> #include <linux/cpu.h> #include <linux/bitops.h> +#include <linux/device.h> #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/nmi.h> -#include <asm/compat.h> #include <asm/smp.h> #include <asm/alternative.h> +#include <asm/timer.h> +#include <asm/desc.h> +#include <asm/ldt.h> #include "perf_event.h" -#if 0 -#undef wrmsrl -#define wrmsrl(msr, val) \ -do { \ - trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\ - (unsigned long)(val)); \ - native_write_msr((msr), (u32)((u64)(val)), \ - (u32)((u64)(val) >> 32)); \ -} while (0) -#endif - struct x86_pmu x86_pmu __read_mostly; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { @@ -73,7 +65,7 @@ u64 x86_perf_event_update(struct perf_event *event) int idx = hwc->idx; s64 delta; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -85,7 +77,7 @@ u64 x86_perf_event_update(struct perf_event *event) */ again: prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(hwc->event_base, new_raw_count); + rdpmcl(hwc->event_base_rdpmc, new_raw_count); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -126,6 +118,9 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; + /* Check if the extra msrs can be safely accessed*/ + if (!er->extra_msr_access) + return -ENXIO; reg->idx = er->idx; reg->config = event->attr.config1; @@ -188,8 +183,9 @@ static void release_pmc_hardware(void) {} static bool check_hw_exists(void) { - u64 val, val_new = 0; - int i, reg, ret = 0; + u64 val, val_fail, val_new= ~0; + int i, reg, reg_fail, ret = 0; + int bios_fail = 0; /* * Check to see if the BIOS enabled any of the counters, if so @@ -200,8 +196,11 @@ static bool check_hw_exists(void) ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; - if (val & ARCH_PERFMON_EVENTSEL_ENABLE) - goto bios_fail; + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { + bios_fail = 1; + val_fail = val; + reg_fail = reg; + } } if (x86_pmu.num_counters_fixed) { @@ -210,35 +209,41 @@ static bool check_hw_exists(void) if (ret) goto msr_fail; for (i = 0; i < x86_pmu.num_counters_fixed; i++) { - if (val & (0x03 << i*4)) - goto bios_fail; + if (val & (0x03 << i*4)) { + bios_fail = 1; + val_fail = val; + reg_fail = reg; + } } } /* - * Now write a value and read it back to see if it matches, - * this is needed to detect certain hardware emulators (qemu/kvm) - * that don't trap on the MSR access and always return 0s. + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ - val = 0xabcdUL; - ret = checking_wrmsrl(x86_pmu_event_addr(0), val); - ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); + reg = x86_pmu_event_addr(0); + if (rdmsrl_safe(reg, &val)) + goto msr_fail; + val ^= 0xffffUL; + ret = wrmsrl_safe(reg, val); + ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) goto msr_fail; - return true; - -bios_fail: /* * We still allow the PMU driver to operate: */ - printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n"); - printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val); + if (bios_fail) { + printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n"); + printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail); + } return true; msr_fail: printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); return false; } @@ -301,15 +306,6 @@ int x86_setup_perfctr(struct perf_event *event) hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); - } else { - /* - * If we have a PMU initialized but no APIC - * interrupts, we cannot sample hardware - * events (user-space has to fall back and - * sample via a hrtimer based software event): - */ - if (!x86_pmu.apic) - return -EOPNOTSUPP; } if (attr->type == PERF_TYPE_RAW) @@ -351,13 +347,43 @@ int x86_setup_perfctr(struct perf_event *event) return 0; } +/* + * check that branch_sample_type is compatible with + * settings needed for precise_ip > 1 which implies + * using the LBR to capture ALL taken branches at the + * priv levels of the measurement + */ +static inline int precise_br_compat(struct perf_event *event) +{ + u64 m = event->attr.branch_sample_type; + u64 b = 0; + + /* must capture all branches */ + if (!(m & PERF_SAMPLE_BRANCH_ANY)) + return 0; + + m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_user) + b |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + b |= PERF_SAMPLE_BRANCH_KERNEL; + + /* + * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 + */ + + return m == b; +} + int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { int precise = 0; /* Support for constant skid */ - if (x86_pmu.pebs_active) { + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; /* Support for IP fixup */ @@ -367,6 +393,37 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; + /* + * check that PEBS LBR correction does not conflict with + * whatever the user is asking with attr->branch_sample_type + */ + if (event->attr.precise_ip > 1 && + x86_pmu.intel_cap.pebs_format < 2) { + u64 *br_type = &event->attr.branch_sample_type; + + if (has_branch_stack(event)) { + if (!precise_br_compat(event)) + return -EOPNOTSUPP; + + /* branch_sample_type is compatible */ + + } else { + /* + * user did not specify branch_sample_type + * + * For PEBS fixups, we capture all + * the branches at the priv level of the + * event. + */ + *br_type = PERF_SAMPLE_BRANCH_ANY; + + if (!event->attr.exclude_user) + *br_type |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + *br_type |= PERF_SAMPLE_BRANCH_KERNEL; + } + } } /* @@ -423,6 +480,7 @@ static int __x86_pmu_event_init(struct perf_event *event) /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; return x86_pmu.hw_config(event); } @@ -505,7 +563,7 @@ struct sched_state { struct perf_sched { int max_weight; int max_events; - struct event_constraint **constraints; + struct perf_event **events; struct sched_state state; int saved_states; struct sched_state saved[SCHED_STATES_MAX]; @@ -514,7 +572,7 @@ struct perf_sched { /* * Initialize interator that runs through all events and counters. */ -static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c, +static void perf_sched_init(struct perf_sched *sched, struct perf_event **events, int num, int wmin, int wmax) { int idx; @@ -522,10 +580,10 @@ static void perf_sched_init(struct perf_sched *sched, struct event_constraint ** memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; - sched->constraints = c; + sched->events = events; for (idx = 0; idx < num; idx++) { - if (c[idx]->weight == wmin) + if (events[idx]->hw.constraint->weight == wmin) break; } @@ -572,19 +630,18 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) if (sched->state.event >= sched->max_events) return false; - c = sched->constraints[sched->state.event]; - + c = sched->events[sched->state.event]->hw.constraint; /* Prefer fixed purpose counters */ - if (x86_pmu.num_counters_fixed) { - idx = X86_PMC_IDX_FIXED; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { + if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { + idx = INTEL_PMC_IDX_FIXED; + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } @@ -631,7 +688,7 @@ static bool perf_sched_next_event(struct perf_sched *sched) if (sched->state.weight > sched->max_weight) return false; } - c = sched->constraints[sched->state.event]; + c = sched->events[sched->state.event]->hw.constraint; } while (c->weight != sched->state.weight); sched->state.counter = 0; /* start with first counter */ @@ -642,12 +699,12 @@ static bool perf_sched_next_event(struct perf_sched *sched) /* * Assign a counter for each event. */ -static int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int *assign) +int perf_assign_events(struct perf_event **events, int n, + int wmin, int wmax, int *assign) { struct perf_sched sched; - perf_sched_init(&sched, constraints, n, wmin, wmax); + perf_sched_init(&sched, events, n, wmin, wmax); do { if (!perf_sched_find_counter(&sched)) @@ -658,19 +715,23 @@ static int perf_assign_events(struct event_constraint **constraints, int n, return sched.state.unassigned; } +EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { - struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; + struct event_constraint *c; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct perf_event *e; int i, wmin, wmax, num = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { + hwc = &cpuc->event_list[i]->hw; c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); - constraints[i] = c; + hwc->constraint = c; + wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); } @@ -680,7 +741,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) */ for (i = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; - c = constraints[i]; + c = hwc->constraint; /* never assigned */ if (hwc->idx == -1) @@ -701,16 +762,35 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* slow path */ if (i != n) - num = perf_assign_events(constraints, n, wmin, wmax, assign); + num = perf_assign_events(cpuc->event_list, n, wmin, + wmax, assign); /* + * Mark the event as committed, so we do not put_constraint() + * in case new events are added and fail scheduling. + */ + if (!num && assign) { + for (i = 0; i < n; i++) { + e = cpuc->event_list[i]; + e->hw.flags |= PERF_X86_EVENT_COMMITTED; + } + } + /* * scheduling failed or is just a simulation, * free resources if necessary */ if (!assign || num) { for (i = 0; i < n; i++) { + e = cpuc->event_list[i]; + /* + * do not put_constraint() on comitted events, + * because they are good to go + */ + if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) + continue; + if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); + x86_pmu.put_event_constraints(cpuc, e); } } return num ? -EINVAL : 0; @@ -762,15 +842,17 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; - if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { + if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { hwc->config_base = 0; hwc->event_base = 0; - } else if (hwc->idx >= X86_PMC_IDX_FIXED) { + } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; } else { hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); + hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); } } @@ -805,7 +887,6 @@ static void x86_pmu_enable(struct pmu *pmu) * hw_perf_group_sched_in() or x86_pmu_enable() * * step1: save events moving to new counters - * step2: reprogram moved events into new counters */ for (i = 0; i < n_running; i++) { event = cpuc->event_list[i]; @@ -831,6 +912,9 @@ static void x86_pmu_enable(struct pmu *pmu) x86_pmu_stop(event, PERF_EF_UPDATE); } + /* + * step2: reprogram moved events into new counters + */ for (i = 0; i < cpuc->n_events; i++) { event = cpuc->event_list[i]; hwc = &event->hw; @@ -868,7 +952,7 @@ int x86_perf_event_set_period(struct perf_event *event) s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -956,7 +1040,7 @@ static int x86_pmu_add(struct perf_event *event, int flags) /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be performed - * at commit time (->commit_txn) as a whole + * at commit time (->commit_txn) as a whole. */ if (cpuc->group_flag & PERF_EVENT_TXN) goto done_collect; @@ -971,6 +1055,10 @@ static int x86_pmu_add(struct perf_event *event, int flags) memcpy(cpuc->assign, assign, n*sizeof(int)); done_collect: + /* + * Commit the collect_events() state. See x86_pmu_del() and + * x86_pmu_*_txn(). + */ cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; @@ -1088,28 +1176,46 @@ static void x86_pmu_del(struct perf_event *event, int flags) int i; /* + * event is descheduled + */ + event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; + + /* * If we're called during a txn, we don't need to do anything. * The events never got scheduled and ->cancel_txn will truncate * the event_list. + * + * XXX assumes any ->del() called during a TXN will only be on + * an event added during that same TXN. */ if (cpuc->group_flag & PERF_EVENT_TXN) return; + /* + * Not a TXN, therefore cleanup properly. + */ x86_pmu_stop(event, PERF_EF_UPDATE); for (i = 0; i < cpuc->n_events; i++) { - if (event == cpuc->event_list[i]) { + if (event == cpuc->event_list[i]) + break; + } - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, event); + if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ + return; - while (++i < cpuc->n_events) - cpuc->event_list[i-1] = cpuc->event_list[i]; + /* If we have a newly added event; make sure to decrease n_added. */ + if (i >= cpuc->n_events - cpuc->n_added) + --cpuc->n_added; + + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, event); + + /* Delete the array entry. */ + while (++i < cpuc->n_events) + cpuc->event_list[i-1] = cpuc->event_list[i]; + --cpuc->n_events; - --cpuc->n_events; - break; - } - } perf_event_update_userpage(event); } @@ -1121,8 +1227,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); /* @@ -1157,7 +1261,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; @@ -1183,19 +1287,30 @@ void perf_events_lapic_init(void) apic_write(APIC_LVTPC, APIC_DM_NMI); } -static int __kprobes +static int perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { + u64 start_clock; + u64 finish_clock; + int ret; + if (!atomic_read(&active_events)) return NMI_DONE; - return x86_pmu.handle_irq(regs); + start_clock = sched_clock(); + ret = x86_pmu.handle_irq(regs); + finish_clock = sched_clock(); + + perf_sample_event_took(finish_clock - start_clock); + + return ret; } +NOKPROBE_SYMBOL(perf_event_nmi_handler); struct event_constraint emptyconstraint; struct event_constraint unconstrained; -static int __cpuinit +static int x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; @@ -1210,6 +1325,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_STARTING: + if (x86_pmu.attr_rdpmc) + set_in_cr4(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1244,12 +1361,163 @@ static void __init pmu_check_apic(void) x86_pmu.apic = 0; pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); pr_info("no hardware sampling interrupt available.\n"); + + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * events (user-space has to fall back and + * sample via a hrtimer based software event): + */ + pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + +} + +static struct attribute_group x86_pmu_format_group = { + .name = "format", + .attrs = NULL, +}; + +/* + * Remove all undefined events (x86_pmu.event_map(id) == 0) + * out of events_attr attributes. + */ +static void __init filter_events(struct attribute **attrs) +{ + struct device_attribute *d; + struct perf_pmu_events_attr *pmu_attr; + int i, j; + + for (i = 0; attrs[i]; i++) { + d = (struct device_attribute *)attrs[i]; + pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); + /* str trumps id */ + if (pmu_attr->event_str) + continue; + if (x86_pmu.event_map(i)) + continue; + + for (j = i; attrs[j]; j++) + attrs[j] = attrs[j + 1]; + + /* Check the shifted attr. */ + i--; + } +} + +/* Merge two pointer arrays */ +static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) +{ + struct attribute **new; + int j, i; + + for (j = 0; a[j]; j++) + ; + for (i = 0; b[i]; i++) + j++; + j++; + + new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL); + if (!new) + return NULL; + + j = 0; + for (i = 0; a[i]; i++) + new[j++] = a[i]; + for (i = 0; b[i]; i++) + new[j++] = b[i]; + new[j] = NULL; + + return new; +} + +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = \ + container_of(attr, struct perf_pmu_events_attr, attr); + u64 config = x86_pmu.event_map(pmu_attr->id); + + /* string trumps id */ + if (pmu_attr->event_str) + return sprintf(page, "%s", pmu_attr->event_str); + + return x86_pmu.events_sysfs_show(page, config); +} + +EVENT_ATTR(cpu-cycles, CPU_CYCLES ); +EVENT_ATTR(instructions, INSTRUCTIONS ); +EVENT_ATTR(cache-references, CACHE_REFERENCES ); +EVENT_ATTR(cache-misses, CACHE_MISSES ); +EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); +EVENT_ATTR(branch-misses, BRANCH_MISSES ); +EVENT_ATTR(bus-cycles, BUS_CYCLES ); +EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); +EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); +EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); + +static struct attribute *empty_attrs; + +static struct attribute *events_attr[] = { + EVENT_PTR(CPU_CYCLES), + EVENT_PTR(INSTRUCTIONS), + EVENT_PTR(CACHE_REFERENCES), + EVENT_PTR(CACHE_MISSES), + EVENT_PTR(BRANCH_INSTRUCTIONS), + EVENT_PTR(BRANCH_MISSES), + EVENT_PTR(BUS_CYCLES), + EVENT_PTR(STALLED_CYCLES_FRONTEND), + EVENT_PTR(STALLED_CYCLES_BACKEND), + EVENT_PTR(REF_CPU_CYCLES), + NULL, +}; + +static struct attribute_group x86_pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; + +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) +{ + u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; + bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); + bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); + bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); + bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); + ssize_t ret; + + /* + * We have whole page size to spend and just little data + * to write, so we can safely use sprintf. + */ + ret = sprintf(page, "event=0x%02llx", event); + + if (umask) + ret += sprintf(page + ret, ",umask=0x%02llx", umask); + + if (edge) + ret += sprintf(page + ret, ",edge"); + + if (pc) + ret += sprintf(page + ret, ",pc"); + + if (any) + ret += sprintf(page + ret, ",any"); + + if (inv) + ret += sprintf(page + ret, ",inv"); + + if (cmask) + ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); + + ret += sprintf(page + ret, "\n"); + + return ret; } static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; - struct event_constraint *c; int err; pr_info("Performance Events: "); @@ -1262,7 +1530,7 @@ static int __init init_hw_perf_events(void) err = amd_pmu_init(); break; default: - return 0; + err = -ENOTSUPP; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); @@ -1277,46 +1545,37 @@ static int __init init_hw_perf_events(void) pr_cont("%s PMU driver.\n", x86_pmu.name); + x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; - } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; - } - - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + if (!x86_pmu.intel_ctrl) + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters, 0); + 0, x86_pmu.num_counters, 0, 0); - if (x86_pmu.event_constraints) { - /* - * event on fixed counter2 (REF_CYCLES) only works on this - * counter, so do not extend mask to generic counters - */ - for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != X86_RAW_EVENT_MASK - || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) { - continue; - } + x86_pmu_format_group.attrs = x86_pmu.format_attrs; - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - c->weight += x86_pmu.num_counters; - } + if (x86_pmu.event_attrs) + x86_pmu_events_group.attrs = x86_pmu.event_attrs; + + if (!x86_pmu.events_sysfs_show) + x86_pmu_events_group.attrs = &empty_attrs; + else + filter_events(x86_pmu_events_group.attrs); + + if (x86_pmu.cpu_events) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events); + if (!WARN_ON(!tmp)) + x86_pmu_events_group.attrs = tmp; } pr_info("... version: %d\n", x86_pmu.version); @@ -1360,7 +1619,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) { __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); /* - * Truncate the collected events. + * Truncate collected array by the number of events added in this + * transaction. See x86_pmu_add() and x86_pmu_*_txn(). */ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); @@ -1371,6 +1631,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) * Commit group events scheduling transaction * Perform the group schedulability test as a whole * Return 0 if success + * + * Does not cancel the transaction on failure; expects the caller to do this. */ static int x86_pmu_commit_txn(struct pmu *pmu) { @@ -1426,6 +1688,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void) if (!cpuc->shared_regs) goto error; } + cpuc->is_fake = 1; return cpuc; error: free_fake_cpuc(cpuc); @@ -1542,23 +1805,138 @@ static int x86_pmu_event_init(struct perf_event *event) return err; } +static int x86_pmu_event_idx(struct perf_event *event) +{ + int idx = event->hw.idx; + + if (!x86_pmu.attr_rdpmc) + return 0; + + if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { + idx -= INTEL_PMC_IDX_FIXED; + idx |= 1 << 30; + } + + return idx + 1; +} + +static ssize_t get_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); +} + +static void change_rdpmc(void *info) +{ + bool enable = !!(unsigned long)info; + + if (enable) + set_in_cr4(X86_CR4_PCE); + else + clear_in_cr4(X86_CR4_PCE); +} + +static ssize_t set_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (x86_pmu.attr_rdpmc_broken) + return -ENOTSUPP; + + if (!!val != !!x86_pmu.attr_rdpmc) { + x86_pmu.attr_rdpmc = !!val; + on_each_cpu(change_rdpmc, (void *)val, 1); + } + + return count; +} + +static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); + +static struct attribute *x86_pmu_attrs[] = { + &dev_attr_rdpmc.attr, + NULL, +}; + +static struct attribute_group x86_pmu_attr_group = { + .attrs = x86_pmu_attrs, +}; + +static const struct attribute_group *x86_pmu_attr_groups[] = { + &x86_pmu_attr_group, + &x86_pmu_format_group, + &x86_pmu_events_group, + NULL, +}; + +static void x86_pmu_flush_branch_stack(void) +{ + if (x86_pmu.flush_branch_stack) + x86_pmu.flush_branch_stack(); +} + +void perf_check_microcode(void) +{ + if (x86_pmu.check_microcode) + x86_pmu.check_microcode(); +} +EXPORT_SYMBOL_GPL(perf_check_microcode); + static struct pmu pmu = { - .pmu_enable = x86_pmu_enable, - .pmu_disable = x86_pmu_disable, + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, + + .attr_groups = x86_pmu_attr_groups, + + .event_init = x86_pmu_event_init, - .event_init = x86_pmu_event_init, + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, - .add = x86_pmu_add, - .del = x86_pmu_del, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, + .event_idx = x86_pmu_event_idx, + .flush_branch_stack = x86_pmu_flush_branch_stack, }; +void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) +{ + struct cyc2ns_data *data; + + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; + userpg->pmc_width = x86_pmu.cntval_bits; + + if (!sched_clock_stable()) + return; + + data = cyc2ns_read_begin(); + + userpg->cap_user_time = 1; + userpg->time_mult = data->cyc2ns_mul; + userpg->time_shift = data->cyc2ns_shift; + userpg->time_offset = data->cyc2ns_offset - now; + + userpg->cap_user_time_zero = 1; + userpg->time_zero = data->cyc2ns_offset; + + cyc2ns_read_end(data); +} + /* * callchain support */ @@ -1594,32 +1972,68 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } +static inline int +valid_user_frame(const void __user *fp, unsigned long size) +{ + return (__range_not_ok(fp, size, TASK_SIZE) == 0); +} + +static unsigned long get_segment_base(unsigned int segment) +{ + struct desc_struct *desc; + int idx = segment >> 3; + + if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { + if (idx > LDT_ENTRIES) + return 0; + + if (idx > current->active_mm->context.size) + return 0; + + desc = current->active_mm->context.ldt; + } else { + if (idx > GDT_ENTRIES) + return 0; + + desc = __this_cpu_ptr(&gdt_page.gdt[0]); + } + + return get_desc_base(desc + idx); +} + #ifdef CONFIG_COMPAT + +#include <asm/compat.h> + static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { /* 32-bit process in 64-bit kernel. */ + unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; const void __user *fp; if (!test_thread_flag(TIF_IA32)) return 0; - fp = compat_ptr(regs->bp); + cs_base = get_segment_base(regs->cs); + ss_base = get_segment_base(regs->ss); + + fp = compat_ptr(ss_base + regs->bp); while (entry->nr < PERF_MAX_STACK_DEPTH) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); - if (bytes != sizeof(frame)) + if (bytes != 0) break; - if (fp < compat_ptr(regs->sp)) + if (!valid_user_frame(fp, sizeof(frame))) break; - perf_callchain_store(entry, frame.return_address); - fp = compat_ptr(frame.next_frame); + perf_callchain_store(entry, cs_base + frame.return_address); + fp = compat_ptr(ss_base + frame.next_frame); } return 1; } @@ -1642,6 +2056,12 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; } + /* + * We don't know what to do with VM86 stacks.. ignore them for now. + */ + if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) + return; + fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); @@ -1658,10 +2078,10 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) frame.return_address = 0; bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); - if (bytes != sizeof(frame)) + if (bytes != 0) break; - if ((unsigned long)fp < regs->sp) + if (!valid_user_frame(fp, sizeof(frame))) break; perf_callchain_store(entry, frame.return_address); @@ -1669,16 +2089,50 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) } } -unsigned long perf_instruction_pointer(struct pt_regs *regs) +/* + * Deal with code segment offsets for the various execution modes: + * + * VM86 - the good olde 16 bit days, where the linear address is + * 20 bits and we use regs->ip + 0x10 * regs->cs. + * + * IA32 - Where we need to look at GDT/LDT segment descriptor tables + * to figure out what the 32bit base address is. + * + * X32 - has TIF_X32 set, but is running in x86_64 + * + * X86_64 - CS,DS,SS,ES are all zero based. + */ +static unsigned long code_segment_base(struct pt_regs *regs) { - unsigned long ip; + /* + * If we are in VM86 mode, add the segment offset to convert to a + * linear address. + */ + if (regs->flags & X86_VM_MASK) + return 0x10 * regs->cs; + + /* + * For IA32 we look at the GDT/LDT segment base to convert the + * effective IP to a linear address. + */ +#ifdef CONFIG_X86_32 + if (user_mode(regs) && regs->cs != __USER_CS) + return get_segment_base(regs->cs); +#else + if (test_thread_flag(TIF_IA32)) { + if (user_mode(regs) && regs->cs != __USER32_CS) + return get_segment_base(regs->cs); + } +#endif + return 0; +} +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - ip = perf_guest_cbs->get_guest_ip(); - else - ip = instruction_pointer(regs); + return perf_guest_cbs->get_guest_ip(); - return ip; + return regs->ip + code_segment_base(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) |
