diff options
Diffstat (limited to 'kernel')
69 files changed, 4540 insertions, 3109 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index e898c5b9d02..f70396e5a24 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -2,16 +2,15 @@ # Makefile for the linux kernel. # -obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ +obj-y = fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o sched_clock.o cred.o \ - async.o range.o -obj-y += groups.o + notifier.o ksysfs.o cred.o \ + async.o range.o groups.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files @@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg -CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif +obj-y += sched/ + obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o @@ -99,7 +99,6 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ -obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o @@ -110,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o -ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) -# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is -# needed for x86 only. Why this used to be enabled for all architectures is beyond -# me. I suspect most platforms don't need this, but until we know that for sure -# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k -# to get a correct value for the wait-channel (WCHAN in ps). --davidm -CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer -endif - $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. diff --git a/kernel/acct.c b/kernel/acct.c index fa7eb3de2dd..203dfead2e0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -613,8 +613,8 @@ void acct_collect(long exitcode, int group_dead) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; - pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); - pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); + pacct->ac_utime += current->utime; + pacct->ac_stime += current->stime; pacct->ac_minflt += current->min_flt; pacct->ac_majflt += current->maj_flt; spin_unlock_irq(¤t->sighand->siglock); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d9d5648f3cd..a184470cf9b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2098,11 +2098,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) continue; /* get old css_set pointer */ task_lock(tsk); - if (tsk->flags & PF_EXITING) { - /* ignore this task if it's going away */ - task_unlock(tsk); - continue; - } oldcg = tsk->cgroups; get_css_set(oldcg); task_unlock(tsk); diff --git a/kernel/cpu.c b/kernel/cpu.c index 563f1360947..5ca38d5d238 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -178,8 +178,7 @@ static inline void check_for_tasks(int cpu) write_lock_irq(&tasklist_lock); for_each_process(p) { if (task_cpu(p) == cpu && p->state == TASK_RUNNING && - (!cputime_eq(p->utime, cputime_zero) || - !cputime_eq(p->stime, cputime_zero))) + (p->utime || p->stime)) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " "(state = %ld, flags = %x)\n", p->comm, task_pid_nr(p), cpu, @@ -380,6 +379,7 @@ out: cpu_maps_update_done(); return err; } +EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9fe58c46a42..0b1712dba58 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task) struct cpuset, css); } +#ifdef CONFIG_NUMA +static inline bool task_has_mempolicy(struct task_struct *task) +{ + return task->mempolicy; +} +#else +static inline bool task_has_mempolicy(struct task_struct *task) +{ + return false; +} +#endif + + /* bits in struct cpuset flags field */ typedef enum { CS_CPU_EXCLUSIVE, @@ -949,7 +962,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { - bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed); + bool need_loop; repeat: /* @@ -962,6 +975,14 @@ repeat: return; task_lock(tsk); + /* + * Determine if a loop is necessary if another thread is doing + * get_mems_allowed(). If at least one node remains unchanged and + * tsk does not have a mempolicy, then an empty nodemask will not be + * possible when mems_allowed is larger than a word. + */ + need_loop = task_has_mempolicy(tsk) || + !nodes_intersects(*newmems, tsk->mems_allowed); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); @@ -981,11 +1002,9 @@ repeat: /* * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. No wait is necessary, however, if at least one - * node remains unchanged. + * for the read-side. */ - while (masks_disjoint && - ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { task_unlock(tsk); if (!task_curr(tsk)) yield(); diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 5532dd37aa8..7d6fb40d218 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p) (p->exit_state & EXIT_ZOMBIE) ? 'Z' : (p->exit_state & EXIT_DEAD) ? 'E' : (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; - if (p->pid == 0) { + if (is_idle_task(p)) { /* Idle task. Is it really idle, apart from the kdb * interrupt? */ if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 89e5e8aa4c3..22d901f9caf 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = -pg endif -obj-y := core.o ring_buffer.o +obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c new file mode 100644 index 00000000000..057e24b665c --- /dev/null +++ b/kernel/events/callchain.c @@ -0,0 +1,191 @@ +/* + * Performance events callchain code, extracted from core.c: + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * For licensing details see kernel-base/COPYING + */ + +#include <linux/perf_event.h> +#include <linux/slab.h> +#include "internal.h" + +struct callchain_cpus_entries { + struct rcu_head rcu_head; + struct perf_callchain_entry *cpu_entries[0]; +}; + +static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static atomic_t nr_callchain_events; +static DEFINE_MUTEX(callchain_mutex); +static struct callchain_cpus_entries *callchain_cpus_entries; + + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static void release_callchain_buffers_rcu(struct rcu_head *head) +{ + struct callchain_cpus_entries *entries; + int cpu; + + entries = container_of(head, struct callchain_cpus_entries, rcu_head); + + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + + kfree(entries); +} + +static void release_callchain_buffers(void) +{ + struct callchain_cpus_entries *entries; + + entries = callchain_cpus_entries; + rcu_assign_pointer(callchain_cpus_entries, NULL); + call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); +} + +static int alloc_callchain_buffers(void) +{ + int cpu; + int size; + struct callchain_cpus_entries *entries; + + /* + * We can't use the percpu allocation API for data that can be + * accessed from NMI. Use a temporary manual per cpu allocation + * until that gets sorted out. + */ + size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); + + entries = kzalloc(size, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + + for_each_possible_cpu(cpu) { + entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!entries->cpu_entries[cpu]) + goto fail; + } + + rcu_assign_pointer(callchain_cpus_entries, entries); + + return 0; + +fail: + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + kfree(entries); + + return -ENOMEM; +} + +int get_callchain_buffers(void) +{ + int err = 0; + int count; + + mutex_lock(&callchain_mutex); + + count = atomic_inc_return(&nr_callchain_events); + if (WARN_ON_ONCE(count < 1)) { + err = -EINVAL; + goto exit; + } + + if (count > 1) { + /* If the allocation failed, give up */ + if (!callchain_cpus_entries) + err = -ENOMEM; + goto exit; + } + + err = alloc_callchain_buffers(); + if (err) + release_callchain_buffers(); +exit: + mutex_unlock(&callchain_mutex); + + return err; +} + +void put_callchain_buffers(void) +{ + if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { + release_callchain_buffers(); + mutex_unlock(&callchain_mutex); + } +} + +static struct perf_callchain_entry *get_callchain_entry(int *rctx) +{ + int cpu; + struct callchain_cpus_entries *entries; + + *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + if (*rctx == -1) + return NULL; + + entries = rcu_dereference(callchain_cpus_entries); + if (!entries) + return NULL; + + cpu = smp_processor_id(); + + return &entries->cpu_entries[cpu][*rctx]; +} + +static void +put_callchain_entry(int rctx) +{ + put_recursion_context(__get_cpu_var(callchain_recursion), rctx); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + int rctx; + struct perf_callchain_entry *entry; + + + entry = get_callchain_entry(&rctx); + if (rctx == -1) + return NULL; + + if (!entry) + goto exit_put; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } + +exit_put: + put_callchain_entry(rctx); + + return entry; +} diff --git a/kernel/events/core.c b/kernel/events/core.c index 0e8457da6f9..890eb02c2f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -128,7 +128,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct jump_label_key perf_sched_events __read_mostly; +struct jump_label_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ -185,6 +185,9 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb); + void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -1127,6 +1130,8 @@ event_sched_out(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu--; ctx->nr_active--; + if (event->attr.freq && event->attr.sample_freq) + ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } @@ -1322,6 +1327,7 @@ retry: } raw_spin_unlock_irq(&ctx->lock); } +EXPORT_SYMBOL_GPL(perf_event_disable); static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, @@ -1403,6 +1409,8 @@ event_sched_in(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; + if (event->attr.freq && event->attr.sample_freq) + ctx->nr_freq++; if (event->attr.exclusive) cpuctx->exclusive = 1; @@ -1659,8 +1667,7 @@ retry: * Note: this works for group members as well as group leaders * since the non-leader members' sibling_lists will be empty. */ -static void __perf_event_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) +static void __perf_event_mark_enabled(struct perf_event *event) { struct perf_event *sub; u64 tstamp = perf_event_time(event); @@ -1698,7 +1705,7 @@ static int __perf_event_enable(void *info) */ perf_cgroup_set_timestamp(current, ctx); - __perf_event_mark_enabled(event, ctx); + __perf_event_mark_enabled(event); if (!event_filter_match(event)) { if (is_cgroup_event(event)) @@ -1779,7 +1786,7 @@ void perf_event_enable(struct perf_event *event) retry: if (!ctx->is_active) { - __perf_event_mark_enabled(event, ctx); + __perf_event_mark_enabled(event); goto out; } @@ -1806,6 +1813,7 @@ retry: out: raw_spin_unlock_irq(&ctx->lock); } +EXPORT_SYMBOL_GPL(perf_event_enable); int perf_event_refresh(struct perf_event *event, int refresh) { @@ -2171,9 +2179,10 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, ctx, task); + if (ctx->nr_events) + cpuctx->task_ctx = ctx; - cpuctx->task_ctx = ctx; + perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); @@ -2323,6 +2332,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) u64 interrupts, now; s64 delta; + if (!ctx->nr_freq) + return; + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -2378,12 +2390,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) { u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1; + int rotate = 0, remove = 1, freq = 0; if (cpuctx->ctx.nr_events) { remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; + if (cpuctx->ctx.nr_freq) + freq = 1; } |