diff options
Diffstat (limited to 'kernel/events')
-rw-r--r-- | kernel/events/core.c | 132 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 156 |
2 files changed, 205 insertions, 83 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index 953c1434837..17b3c6cf160 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; -static atomic_t perf_sample_allowed_ns __read_mostly = - ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); +static int perf_sample_allowed_ns __read_mostly = + DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; void update_perf_cpu_limits(void) { @@ -184,7 +184,7 @@ void update_perf_cpu_limits(void) tmp *= sysctl_perf_cpu_time_max_percent; do_div(tmp, 100); - atomic_set(&perf_sample_allowed_ns, tmp); + ACCESS_ONCE(perf_sample_allowed_ns) = tmp; } static int perf_rotate_context(struct perf_cpu_context *cpuctx); @@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec(table, write, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; @@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, * we detect that events are taking too long. */ #define NR_ACCUMULATED_SAMPLES 128 -DEFINE_PER_CPU(u64, running_sample_length); +static DEFINE_PER_CPU(u64, running_sample_length); void perf_sample_event_took(u64 sample_len_ns) { u64 avg_local_sample_len; u64 local_samples_len; + u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - if (atomic_read(&perf_sample_allowed_ns) == 0) + if (allowed_ns == 0) return; /* decay the counter by 1 average sample */ @@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns) */ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) + if (avg_local_sample_len <= allowed_ns) return; if (max_samples_per_tick <= 1) @@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns) perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; printk_ratelimited(KERN_WARNING - "perf samples too long (%lld > %d), lowering " + "perf samples too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, - atomic_read(&perf_sample_allowed_ns), + avg_local_sample_len, allowed_ns, sysctl_perf_event_sample_rate); update_perf_cpu_limits(); @@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx) put_ctx(ctx->parent_ctx); ctx->parent_ctx = NULL; } + ctx->generation++; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) @@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; + + ctx->generation++; } /* @@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_DATA_SRC) size += sizeof(data->data_src.val); + if (sample_type & PERF_SAMPLE_TRANSACTION) + size += sizeof(data->txn); + event->header_size = size; } @@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) */ if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; + + ctx->generation++; } static void perf_group_detach(struct perf_event *event) @@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx, } /* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled events. - * If the number of enabled events is the same, then the set - * of enabled events should be the same, because these are both - * inherited contexts, therefore we can't access individual events - * in them directly with an fd; we can only enable/disable all - * events via prctl, or enable/disable all events in a family - * via ioctl, which will have the same effect on both contexts. + * Test whether two contexts are equivalent, i.e. whether they have both been + * cloned from the same version of the same context. + * + * Equivalence is measured using a generation number in the context that is + * incremented on each modification to it; see unclone_ctx(), list_add_event() + * and list_del_event(). */ static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { - return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && !ctx1->pin_count && !ctx2->pin_count; + /* Pinning disables the swap optimization */ + if (ctx1->pin_count || ctx2->pin_count) + return 0; + + /* If ctx1 is the parent of ctx2 */ + if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) + return 1; + + /* If ctx2 is the parent of ctx1 */ + if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) + return 1; + + /* + * If ctx1 and ctx2 have the same parent; we flatten the parent + * hierarchy, see perf_event_init_context(). + */ + if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && + ctx1->parent_gen == ctx2->parent_gen) + return 1; + + /* Unmatched */ + return 0; } static void __perf_event_sync_stat(struct perf_event *event, @@ -2244,7 +2268,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, { struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; - struct perf_event_context *parent; + struct perf_event_context *parent, *next_parent; struct perf_cpu_context *cpuctx; int do_switch = 1; @@ -2256,10 +2280,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, return; rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_event_ctxp[ctxn]; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { + if (!next_ctx) + goto unlock; + + parent = rcu_dereference(ctx->parent_ctx); + next_parent = rcu_dereference(next_ctx->parent_ctx); + + /* If neither context have a parent context; they cannot be clones. */ + if (!parent && !next_parent) + goto unlock; + + if (next_parent == ctx || next_ctx == parent || next_parent == parent) { /* * Looks like the two contexts are clones, so we might be * able to optimize the context switch. We lock both @@ -2287,6 +2319,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_unlock(&next_ctx->lock); raw_spin_unlock(&ctx->lock); } +unlock: rcu_read_unlock(); if (do_switch) { @@ -4572,6 +4605,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); + if (sample_type & PERF_SAMPLE_TRANSACTION) + perf_output_put(handle, data->txn); + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -5100,24 +5136,23 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) unsigned int size; char tmp[16]; char *buf = NULL; - const char *name; - - memset(tmp, 0, sizeof(tmp)); + char *name; if (file) { struct inode *inode; dev_t dev; - /* - * d_path works from the end of the rb backwards, so we - * need to add enough zero bytes after the string to handle - * the 64bit alignment we do later. - */ - buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); + + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) { name = strncpy(tmp, "//enomem", sizeof(tmp)); goto got_name; } - name = d_path(&file->f_path, buf, PATH_MAX); + /* + * d_path() works from the end of the rb backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. + */ + name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = strncpy(tmp, "//toolong", sizeof(tmp)); goto got_name; @@ -5130,21 +5165,19 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) min = MINOR(dev); } else { - if (arch_vma_name(mmap_event->vma)) { - name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp) - 1); + name = (char *)arch_vma_name(vma); + if (name) { + name = strncpy(tmp, name, sizeof(tmp) - 1); tmp[sizeof(tmp) - 1] = '\0'; goto got_name; } - if (!vma->vm_mm) { - name = strncpy(tmp, "[vdso]", sizeof(tmp)); - goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_brk && + if (vma->vm_start <= vma->vm_mm->start_brk && vma->vm_end >= vma->vm_mm->brk) { name = strncpy(tmp, "[heap]", sizeof(tmp)); goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_stack && + } + if (vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) { name = strncpy(tmp, "[stack]", sizeof(tmp)); goto got_name; @@ -5155,7 +5188,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) } got_name: - size = ALIGN(strlen(name)+1, sizeof(u64)); + /* + * Since our buffer works in 8 byte units we need to align our string + * size to a multiple of 8. However, we must guarantee the tail end is + * zero'd out to avoid leaking random bits to userspace. + */ + size = strlen(name)+1; + while (!IS_ALIGNED(size, sizeof(u64))) + name[size++] = '\0'; mmap_event->file_name = name; mmap_event->file_size = size; @@ -7126,7 +7166,6 @@ SYSCALL_DEFINE5(perf_event_open, } perf_install_in_context(ctx, event, event->cpu); - ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); @@ -7209,7 +7248,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); - ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ad8e1bdca70..ae9e1d2ef25 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -35,6 +35,7 @@ #include <linux/kdebug.h> /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ #include <linux/percpu-rwsem.h> +#include <linux/task_work.h> #include <linux/uprobes.h> @@ -1096,21 +1097,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon } /* Slot allocation for XOL */ -static int xol_add_vma(struct xol_area *area) +static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { - struct mm_struct *mm = current->mm; int ret = -EALREADY; down_write(&mm->mmap_sem); if (mm->uprobes_state.xol_area) goto fail; - ret = -ENOMEM; - /* Try to map as high as possible, this is only a hint. */ - area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); - if (area->vaddr & ~PAGE_MASK) { - ret = area->vaddr; - goto fail; + if (!area->vaddr) { + /* Try to map as high as possible, this is only a hint. */ + area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, + PAGE_SIZE, 0, 0); + if (area->vaddr & ~PAGE_MASK) { + ret = area->vaddr; + goto fail; + } } ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, @@ -1120,30 +1122,19 @@ static int xol_add_vma(struct xol_area *area) smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; - ret = 0; fail: up_write(&mm->mmap_sem); return ret; } -/* - * get_xol_area - Allocate process's xol_area if necessary. - * This area will be used for storing instructions for execution out of line. - * - * Returns the allocated area or NULL. - */ -static struct xol_area *get_xol_area(void) +static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; - struct xol_area *area; uprobe_opcode_t insn = UPROBE_SWBP_INSN; + struct xol_area *area; - area = mm->uprobes_state.xol_area; - if (area) - goto ret; - - area = kzalloc(sizeof(*area), GFP_KERNEL); + area = kmalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; @@ -1155,13 +1146,14 @@ static struct xol_area *get_xol_area(void) if (!area->page) goto free_bitmap; - /* allocate first slot of task's xol_area for the return probes */ + area->vaddr = vaddr; + init_waitqueue_head(&area->wq); + /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); atomic_set(&area->slot_count, 1); - init_waitqueue_head(&area->wq); + copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); - if (!xol_add_vma(area)) + if (!xol_add_vma(mm, area)) return area; __free_page(area->page); @@ -1170,9 +1162,25 @@ static struct xol_area *get_xol_area(void) free_area: kfree(area); out: + return NULL; +} + +/* + * get_xol_area - Allocate process's xol_area if necessary. + * This area will be used for storing instructions for execution out of line. + * + * Returns the allocated area or NULL. + */ +static struct xol_area *get_xol_area(void) +{ + struct mm_struct *mm = current->mm; + struct xol_area *area; + + if (!mm->uprobes_state.xol_area) + __create_xol_area(0); + area = mm->uprobes_state.xol_area; - ret: - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ return area; } @@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t) } /* - * Called in context of a new clone/fork from copy_process. - */ -void uprobe_copy_process(struct task_struct *t) -{ - t->utask = NULL; -} - -/* * Allocate a uprobe_task object for the task if if necessary. * Called when the thread hits a breakpoint. * @@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void) return current->utask; } +static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) +{ + struct uprobe_task *n_utask; + struct return_instance **p, *o, *n; + + n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + if (!n_utask) + return -ENOMEM; + t->utask = n_utask; + + p = &n_utask->return_instances; + for (o = o_utask->return_instances; o; o = o->next) { + n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); + if (!n) + return -ENOMEM; + + *n = *o; + atomic_inc(&n->uprobe->ref); + n->next = NULL; + + *p = n; + p = &n->next; + n_utask->depth++; + } + + return 0; +} + +static void uprobe_warn(struct task_struct *t, const char *msg) +{ + pr_warn("uprobe: %s:%d failed to %s\n", + current->comm, current->pid, msg); +} + +static void dup_xol_work(struct callback_head *work) +{ + kfree(work); + + if (current->flags & PF_EXITING) + return; + + if (!__create_xol_area(current->utask->vaddr)) + uprobe_warn(current, "dup xol area"); +} + +/* + * Called in context of a new clone/fork from copy_process. + */ +void uprobe_copy_process(struct task_struct *t, unsigned long flags) +{ + struct uprobe_task *utask = current->utask; + struct mm_struct *mm = current->mm; + struct callback_head *work; + struct xol_area *area; + + t->utask = NULL; + + if (!utask || !utask->return_instances) + return; + + if (mm == t->mm && !(flags & CLONE_VFORK)) + return; + + if (dup_utask(t, utask)) + return uprobe_warn(t, "dup ret instances"); + + /* The task can fork() after dup_xol_work() fails */ + area = mm->uprobes_state.xol_area; + if (!area) + return uprobe_warn(t, "dup xol area"); + + if (mm == t->mm) + return; + + /* TODO: move it into the union in uprobe_task */ + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) + return uprobe_warn(t, "dup xol area"); + + utask->vaddr = area->vaddr; + init_task_work(work, dup_xol_work); + task_work_add(t, work, true); +} + /* * Current area->vaddr notion assume the trampoline address is always * equal area->vaddr. |