From db66d756c74acb886c51f11b501c2fe622018a0a Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Fri, 18 Apr 2014 01:59:15 +0900 Subject: sched/docbook: Fix 'make htmldocs' warnings caused by missing description When 'flags' argument to sched_{set,get}attr() syscalls were added in: 6d35ab48090b ("sched: Add 'flags' argument to sched_{set,get}attr() syscalls") no description for 'flags' was added. It causes the following warnings on "make htmldocs": Warning(/kernel/sched/core.c:3645): No description found for parameter 'flags' Warning(/kernel/sched/core.c:3789): No description found for parameter 'flags' Signed-off-by: Masanari Iida Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1397753955-2914-1-git-send-email-standby24x7@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238..9fe2190005c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3639,6 +3639,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) * sys_sched_setattr - same as above, but with extended sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. + * @flags: for future extension. */ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, flags) @@ -3783,6 +3784,7 @@ err_size: * @pid: the pid in question. * @uattr: structure containing the extended parameters. * @size: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. */ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, size, unsigned int, flags) -- cgit v1.2.3-70-g09d2 From 46ce0fe97a6be7532ce6126bb26ce89fed81528c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 May 2014 16:56:01 +0200 Subject: perf: Fix race in removing an event When removing a (sibling) event we do: raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); raw_spin_unlock_irq(&ctx->lock); perf_remove_from_context(event); raw_spin_lock_irq(&ctx->lock); ... raw_spin_unlock_irq(&ctx->lock); Now, assuming the event is a sibling, it will be 'unreachable' for things like ctx_sched_out() because that iterates the groups->siblings, and we just unhooked the sibling. So, if during we get ctx_sched_out(), it will miss the event and not call event_sched_out() on it, leaving it programmed on the PMU. The subsequent perf_remove_from_context() call will find the ctx is inactive and only call list_del_event() to remove the event from all other lists. Hereafter we can proceed to free the event; while still programmed! Close this hole by moving perf_group_detach() inside the same ctx->lock region(s) perf_remove_from_context() has. The condition on inherited events only in __perf_event_exit_task() is likely complete crap because non-inherited events are part of groups too and we're tearing down just the same. But leave that for another patch. Most-likely-Fixes: e03a9a55b4e ("perf: Change close() semantics for group events") Reported-by: Vince Weaver Tested-by: Vince Weaver Much-staring-at-traces-by: Vince Weaver Much-staring-at-traces-by: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140505093124.GN17778@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f83a71a3e46..ea899e2b559 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1443,6 +1443,11 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } +struct remove_event { + struct perf_event *event; + bool detach_group; +}; + /* * Cross CPU call to remove a performance event * @@ -1451,12 +1456,15 @@ group_sched_out(struct perf_event *group_event, */ static int __perf_remove_from_context(void *info) { - struct perf_event *event = info; + struct remove_event *re = info; + struct perf_event *event = re->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); + if (re->detach_group) + perf_group_detach(event); list_del_event(event, ctx); if (!ctx->nr_events && cpuctx->task_ctx == ctx) { ctx->is_active = 0; @@ -1481,10 +1489,14 @@ static int __perf_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event, bool detach_group) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + struct remove_event re = { + .event = event, + .detach_group = detach_group, + }; lockdep_assert_held(&ctx->mutex); @@ -1493,12 +1505,12 @@ static void perf_remove_from_context(struct perf_event *event) * Per cpu events are removed via an smp call and * the removal is always successful. */ - cpu_function_call(event->cpu, __perf_remove_from_context, event); + cpu_function_call(event->cpu, __perf_remove_from_context, &re); return; } retry: - if (!task_function_call(task, __perf_remove_from_context, event)) + if (!task_function_call(task, __perf_remove_from_context, &re)) return; raw_spin_lock_irq(&ctx->lock); @@ -1515,6 +1527,8 @@ retry: * Since the task isn't running, its safe to remove the event, us * holding the ctx->lock ensures the task won't get scheduled in. */ + if (detach_group) + perf_group_detach(event); list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); } @@ -3281,10 +3295,7 @@ int perf_event_release_kernel(struct perf_event *event) * to trigger the AB-BA case. */ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - raw_spin_unlock_irq(&ctx->lock); - perf_remove_from_context(event); + perf_remove_from_context(event, true); mutex_unlock(&ctx->mutex); free_event(event); @@ -7165,7 +7176,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *gctx = group_leader->ctx; mutex_lock(&gctx->mutex); - perf_remove_from_context(group_leader); + perf_remove_from_context(group_leader, false); /* * Removing from the context ends up with disabled @@ -7175,7 +7186,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_remove_from_context(sibling); + perf_remove_from_context(sibling, false); perf_event__state_init(sibling); put_ctx(gctx); } @@ -7305,7 +7316,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) mutex_lock(&src_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { - perf_remove_from_context(event); + perf_remove_from_context(event, false); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->migrate_entry, &events); @@ -7367,13 +7378,7 @@ __perf_event_exit_task(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { - if (child_event->parent) { - raw_spin_lock_irq(&child_ctx->lock); - perf_group_detach(child_event); - raw_spin_unlock_irq(&child_ctx->lock); - } - - perf_remove_from_context(child_event); + perf_remove_from_context(child_event, !!child_event->parent); /* * It can happen that the parent exits first, and has events @@ -7857,14 +7862,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) static void __perf_event_exit_context(void *__info) { + struct remove_event re = { .detach_group = false }; struct perf_event_context *ctx = __info; - struct perf_event *event; perf_pmu_rotate_stop(ctx->pmu); rcu_read_lock(); - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) - __perf_remove_from_context(event); + list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) + __perf_remove_from_context(&re); rcu_read_unlock(); } -- cgit v1.2.3-70-g09d2 From ffb4ef21ac4308c2e738e6f83b6741bbc9b4fa3b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 19:12:20 +0200 Subject: perf: Fix perf_event_init_context() perf_pin_task_context() can return NULL but perf_event_init_context() assumes it will not, correct this. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20140505171428.GU26782@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ea899e2b559..71232844f23 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7729,6 +7729,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn) * swapped under us. */ parent_ctx = perf_pin_task_context(parent, ctxn); + if (!parent_ctx) + return 0; /* * No need to check if parent_ctx != NULL here; since we saw -- cgit v1.2.3-70-g09d2 From 2d513868e2a33e1d5315490ef4c861ee65babd65 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 May 2014 23:26:24 +0200 Subject: sched: Sanitize irq accounting madness Russell reported, that irqtime_account_idle_ticks() takes ages due to: for (i = 0; i < ticks; i++) irqtime_account_process_tick(current, 0, rq); It's sad, that this code was written way _AFTER_ the NOHZ idle functionality was available. I charge myself guitly for not paying attention when that crap got merged with commit abb74cefa ("sched: Export ns irqtimes through /proc/stat") So instead of looping nr_ticks times just apply the whole thing at once. As a side note: The whole cputime_t vs. u64 business in that context wants to be cleaned up as well. There is no point in having all these back and forth conversions. Lets standardise on u64 nsec for all kernel internal accounting and be done with it. Everything else does not make sense at all for fine grained accounting. Frederic, can you please take care of that? Reported-by: Russell King Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Signed-off-by: Peter Zijlstra Cc: Venkatesh Pallipadi Cc: Shaun Ruffell Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1405022307000.6261@ionos.tec.linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a95097cb459..72fdf06ef86 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -332,50 +332,50 @@ out: * softirq as those do not count in task exec_runtime any more. */ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) + struct rq *rq, int ticks) { - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); + u64 cputime = (__force u64) cputime_one_jiffy; u64 *cpustat = kcpustat_this_cpu->cpustat; if (steal_account_process_tick()) return; + cputime *= ticks; + scaled *= ticks; + if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; + cpustat[CPUTIME_IRQ] += cputime; } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; + cpustat[CPUTIME_SOFTIRQ] += cputime; } else if (this_cpu_ksoftirqd() == p) { /* * ksoftirqd time do not get accounted in cpu_softirq_time. * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. */ - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SOFTIRQ); + __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); } else if (user_tick) { - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime, scaled); } else if (p == rq->idle) { - account_idle_time(cputime_one_jiffy); + account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_guest_time(p, cputime, scaled); } else { - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SYSTEM); + __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); } } static void irqtime_account_idle_ticks(int ticks) { - int i; struct rq *rq = this_rq(); - for (i = 0; i < ticks; i++) - irqtime_account_process_tick(current, 0, rq); + irqtime_account_process_tick(current, 0, rq, ticks); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ static inline void irqtime_account_idle_ticks(int ticks) {} static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) {} + struct rq *rq, int nr_ticks) {} #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* @@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick) return; if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq); + irqtime_account_process_tick(p, user_tick, rq, 1); return; } -- cgit v1.2.3-70-g09d2 From 5bfd126e80dca70431aef8fdbc1cf14535f3c338 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 15 Apr 2014 13:49:04 +0200 Subject: sched/deadline: Fix sched_yield() behavior yield_task_dl() is broken: o it forces current to be throttled setting its runtime to zero; o it sets current's dl_se->dl_new to one, expecting that dl_task_timer() will queue it back with proper parameters at replenish time. Unfortunately, dl_task_timer() has this check at the very beginning: if (!dl_task(p) || dl_se->dl_new) goto unlock; So, it just bails out and the task is never replenished. It actually yielded forever. To fix this, introduce a new flag indicating that the task properly yielded the CPU before its current runtime expired. While this is a little overdoing at the moment, the flag would be useful in the future to discriminate between "good" jobs (of which remaining runtime could be reclaimed, i.e. recycled) and "bad" jobs (for which dl_throttled task has been set) that needed to be stopped. Reported-by: yjay.kim Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140429103953.e68eba1b2ac3309214e3dc5a@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 +++++-- kernel/sched/core.c | 1 + kernel/sched/deadline.c | 5 +++-- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 25f54c79f75..2a4298fb0d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1153,9 +1153,12 @@ struct sched_dl_entity { * * @dl_boosted tells if we are boosted due to DI. If so we are * outside bandwidth enforcement mechanism (but only until we - * exit the critical section). + * exit the critical section); + * + * @dl_yielded tells if task gave up the cpu before consuming + * all its available runtime during the last job. */ - int dl_throttled, dl_new, dl_boosted; + int dl_throttled, dl_new, dl_boosted, dl_yielded; /* * Bandwidth enforcement timer. Each -deadline task has its diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9fe2190005c..e62c65a12d5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3124,6 +3124,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_throttled = 0; dl_se->dl_new = 1; + dl_se->dl_yielded = 0; } static void __setscheduler_params(struct task_struct *p, diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b08095786cb..800e99b9907 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -528,6 +528,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); dl_se->dl_throttled = 0; + dl_se->dl_yielded = 0; if (p->on_rq) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (task_has_dl_policy(rq->curr)) @@ -893,10 +894,10 @@ static void yield_task_dl(struct rq *rq) * We make the task go to sleep until its current deadline by * forcing its runtime to zero. This way, update_curr_dl() stops * it and the bandwidth timer will wake it up and will give it - * new scheduling parameters (thanks to dl_new=1). + * new scheduling parameters (thanks to dl_yielded=1). */ if (p->dl.runtime > 0) { - rq->curr->dl.dl_new = 1; + rq->curr->dl.dl_yielded = 1; p->dl.runtime = 0; } update_curr_dl(rq); -- cgit v1.2.3-70-g09d2 From 6a7cd273dc4bc3246f37ebe874754a54ccb29141 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 17 Apr 2014 10:05:02 +0800 Subject: sched/deadline: Fix memory leak Free cpudl->free_cpus allocated in cpudl_init(). Signed-off-by: Li Zefan Acked-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: # 3.14+ Link: http://lkml.kernel.org/r/534F36CE.2000409@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5b9bb42b2d4..ab001b5d504 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -210,7 +210,5 @@ int cpudl_init(struct cpudl *cp) */ void cpudl_cleanup(struct cpudl *cp) { - /* - * nothing to do for the moment - */ + free_cpumask_var(cp->free_cpus); } -- cgit v1.2.3-70-g09d2 From 6227cb00cc120f9a43ce8313bb0475ddabcb7d01 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Sun, 13 Apr 2014 09:34:53 -0400 Subject: sched: Use CPUPRI_NR_PRIORITIES instead of MAX_RT_PRIO in cpupri check The check at the beginning of cpupri_find() makes sure that the task_pri variable does not exceed the cp->pri_to_cpu array length. But that length is CPUPRI_NR_PRIORITIES not MAX_RT_PRIO, where it will miss the last two priorities in that array. As task_pri is computed from convert_prio() which should never be bigger than CPUPRI_NR_PRIORITIES, if the check should cause a panic if it is hit. Reported-by: Mike Galbraith Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1397015410.5212.13.camel@marge.simpson.net Signed-off-by: Ingo Molnar --- kernel/sched/cpupri.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 8b836b376d9..3031bac8aa3 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -70,8 +70,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, int idx = 0; int task_pri = convert_prio(p->prio); - if (task_pri >= MAX_RT_PRIO) - return 0; + BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); for (idx = 0; idx < task_pri; idx++) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; -- cgit v1.2.3-70-g09d2 From 6ccdc84b81a0a6c09a7f0427761d2f8cecfc2218 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Apr 2014 12:00:47 +0200 Subject: sched: Skip double execution of pick_next_task_fair() Tim wrote: "The current code will call pick_next_task_fair a second time in the slow path if we did not pull any task in our first try. This is really unnecessary as we already know no task can be pulled and it doubles the delay for the cpu to enter idle. We instrumented some network workloads and that saw that pick_next_task_fair is frequently called twice before a cpu enters idle. The call to pick_next_task_fair can add non trivial latency as it calls load_balance which runs find_busiest_group on an hierarchy of sched domains spanning the cpus for a large system. For some 4 socket systems, we saw almost 0.25 msec spent per call of pick_next_task_fair before a cpu can be idled." Optimize the second call away for the common case and document the dependency. Reported-by: Tim Chen Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Len Brown Link: http://lkml.kernel.org/r/20140424100047.GP11096@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e62c65a12d5..28921ec91b3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2592,8 +2592,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev) if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq, prev); - if (likely(p && p != RETRY_TASK)) - return p; + if (unlikely(p == RETRY_TASK)) + goto again; + + /* assumes fair_sched_class->next == idle_sched_class */ + if (unlikely(!p)) + p = idle_sched_class.pick_next_task(rq, prev); + + return p; } again: -- cgit v1.2.3-70-g09d2 From 0e5b5337f0da073e1f17aec3c322ea7826975d0d Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 28 Apr 2014 15:45:54 -0700 Subject: sched: Fix updating rq->max_idle_balance_cost and rq->next_balance in idle_balance() The following commit: e5fc66119ec9 ("sched: Fix race in idle_balance()") can potentially cause rq->max_idle_balance_cost to not be updated, even when load_balance(NEWLY_IDLE) is attempted and the per-sd max cost value is updated. Preeti noticed a similar issue with updating rq->next_balance. In this patch, we fix this by making sure we still check/update those values even if a task gets enqueued while browsing the domains. Signed-off-by: Jason Low Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Cc: morten.rasmussen@arm.com Cc: aswin@hp.com Cc: daniel.lezcano@linaro.org Cc: alex.shi@linaro.org Cc: efault@gmx.de Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1398725155-7591-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7570dd969c2..0fdb96de81a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6653,6 +6653,7 @@ static int idle_balance(struct rq *this_rq) int this_cpu = this_rq->cpu; idle_enter_fair(this_rq); + /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -6705,14 +6706,16 @@ static int idle_balance(struct rq *this_rq) raw_spin_lock(&this_rq->lock); + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; + /* - * While browsing the domains, we released the rq lock. - * A task could have be enqueued in the meantime + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. */ - if (this_rq->cfs.h_nr_running && !pulled_task) { + if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; - goto out; - } if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* @@ -6722,9 +6725,6 @@ static int idle_balance(struct rq *this_rq) this_rq->next_balance = next_balance; } - if (curr_cost > this_rq->max_idle_balance_cost) - this_rq->max_idle_balance_cost = curr_cost; - out: /* Is there a task of a high priority class? */ if (this_rq->nr_running != this_rq->cfs.h_nr_running && -- cgit v1.2.3-70-g09d2 From 2b4cfe64dee0d84506b951d81bf55d9891744d25 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Wed, 23 Apr 2014 18:30:34 -0700 Subject: sched/numa: Initialize newidle balance stats in sd_numa_init() Also initialize the per-sd variables for newidle load balancing in sd_numa_init(). Signed-off-by: Jason Low Acked-by: morten.rasmussen@arm.com Cc: daniel.lezcano@linaro.org Cc: alex.shi@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: efault@gmx.de Cc: vincent.guittot@linaro.org Cc: aswin@hp.com Cc: chegu_vinod@hp.com Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1398303035-18255-3-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28921ec91b3..13584f1cccf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6026,6 +6026,8 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) , .last_balance = jiffies, .balance_interval = sd_weight, + .max_newidle_lb_cost = 0, + .next_decay_max_lb_cost = jiffies, }; SD_INIT_NAME(sd, NUMA); sd->private = &tl->data; -- cgit v1.2.3-70-g09d2 From 5024ae29cd285ce9e736776414da645d3a91828c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 7 May 2014 21:31:17 -0400 Subject: cgroup: introduce task_css_is_root() Determining the css of a task usually requires RCU read lock as that's the only thing which keeps the returned css accessible till its reference is acquired; however, testing whether a task belongs to the root can be performed without dereferencing the returned css by comparing the returned pointer against the root one in init_css_set[] which never changes. Implement task_css_is_root() which can be invoked in any context. This will be used by the scheduled cgroup_freezer change. v2: cgroup no longer supports modular controllers. No need to export init_css_set. Pointed out by Li. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 15 +++++++++++++++ kernel/cgroup.c | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c2515851c1a..d60904b9e50 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -473,6 +473,7 @@ struct cftype { }; extern struct cgroup_root cgrp_dfl_root; +extern struct css_set init_css_set; static inline bool cgroup_on_dfl(const struct cgroup *cgrp) { @@ -700,6 +701,20 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, return task_css_check(task, subsys_id, false); } +/** + * task_css_is_root - test whether a task belongs to the root css + * @task: the target task + * @subsys_id: the target subsystem ID + * + * Test whether @task belongs to the root css on the specified subsystem. + * May be invoked in any context. + */ +static inline bool task_css_is_root(struct task_struct *task, int subsys_id) +{ + return task_css_check(task, subsys_id, true) == + init_css_set.subsys[subsys_id]; +} + static inline struct cgroup *task_cgroup(struct task_struct *task, int subsys_id) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 11a03d67635..3f1ca934a23 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -348,7 +348,7 @@ struct cgrp_cset_link { * reference-counted, to improve performance when child cgroups * haven't been created. */ -static struct css_set init_css_set = { +struct css_set init_css_set = { .refcount = ATOMIC_INIT(1), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .tasks = LIST_HEAD_INIT(init_css_set.tasks), -- cgit v1.2.3-70-g09d2 From e5ced8ebb10c20a3b349bd798b69ccabd3b25d21 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 7 May 2014 21:31:17 -0400 Subject: cgroup_freezer: replace freezer->lock with freezer_mutex After 96d365e0b86e ("cgroup: make css_set_lock a rwsem and rename it to css_set_rwsem"), css task iterators requires sleepable context as it may block on css_set_rwsem. I missed that cgroup_freezer was iterating tasks under IRQ-safe spinlock freezer->lock. This leads to errors like the following on freezer state reads and transitions. BUG: sleeping function called from invalid context at /work /os/work/kernel/locking/rwsem.c:20 in_atomic(): 0, irqs_disabled(): 0, pid: 462, name: bash 5 locks held by bash/462: #0: (sb_writers#7){.+.+.+}, at: [] vfs_write+0x1a3/0x1c0 #1: (&of->mutex){+.+.+.}, at: [] kernfs_fop_write+0xbb/0x170 #2: (s_active#70){.+.+.+}, at: [] kernfs_fop_write+0xc3/0x170 #3: (freezer_mutex){+.+...}, at: [] freezer_write+0x61/0x1e0 #4: (rcu_read_lock){......}, at: [] freezer_write+0x53/0x1e0 Preemption disabled at:[] console_unlock+0x1e4/0x460 CPU: 3 PID: 462 Comm: bash Not tainted 3.15.0-rc1-work+ #10 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 ffff88000916a6d0 ffff88000e0a3da0 ffffffff81cf8c96 0000000000000000 ffff88000e0a3dc8 ffffffff810cf4f2 ffffffff82388040 ffff880013aaf740 0000000000000002 ffff88000e0a3de8 ffffffff81d05974 0000000000000246 Call Trace: [] dump_stack+0x4e/0x7a [] __might_sleep+0x162/0x260 [] down_read+0x24/0x60 [] css_task_iter_start+0x27/0x70 [] freezer_apply_state+0x5d/0x130 [] freezer_write+0xf6/0x1e0 [] cgroup_file_write+0xd8/0x230 [] kernfs_fop_write+0xe7/0x170 [] vfs_write+0xb6/0x1c0 [] SyS_write+0x4d/0xc0 [] system_call_fastpath+0x16/0x1b freezer->lock used to be used in hot paths but that time is long gone and there's no reason for the lock to be IRQ-safe spinlock or even per-cgroup. In fact, given the fact that a cgroup may contain large number of tasks, it's not a good idea to iterate over them while holding IRQ-safe spinlock. Let's simplify locking by replacing per-cgroup freezer->lock with global freezer_mutex. This also makes the comments explaining the intricacies of policy inheritance and the locking around it as the states are protected by a common mutex. The conversion is mostly straight-forward. The followings are worth mentioning. * freezer_css_online() no longer needs double locking. * freezer_attach() now performs propagation simply while holding freezer_mutex. update_if_frozen() race no longer exists and the comment is removed. * freezer_fork() now tests whether the task is in root cgroup using the new task_css_is_root() without doing rcu_read_lock/unlock(). If not, it grabs freezer_mutex and performs the operation. * freezer_read() and freezer_change_state() grab freezer_mutex across the whole operation and pin the css while iterating so that each descendant processing happens in sleepable context. Fixes: 96d365e0b86e ("cgroup: make css_set_lock a rwsem and rename it to css_set_rwsem") Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup_freezer.c | 112 ++++++++++++++++++++---------------------------- 1 file changed, 46 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2bc4a225644..12ead0b766e 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is @@ -42,9 +43,10 @@ enum freezer_state_flags { struct freezer { struct cgroup_subsys_state css; unsigned int state; - spinlock_t lock; }; +static DEFINE_MUTEX(freezer_mutex); + static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) { return css ? container_of(css, struct freezer, css) : NULL; @@ -93,7 +95,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) if (!freezer) return ERR_PTR(-ENOMEM); - spin_lock_init(&freezer->lock); return &freezer->css; } @@ -110,14 +111,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); - /* - * The following double locking and freezing state inheritance - * guarantee that @cgroup can never escape ancestors' freezing - * states. See css_for_each_descendant_pre() for details. - */ - if (parent) - spin_lock_irq(&parent->lock); - spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); + mutex_lock(&freezer_mutex); freezer->state |= CGROUP_FREEZER_ONLINE; @@ -126,10 +120,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) atomic_inc(&system_freezing_cnt); } - spin_unlock(&freezer->lock); - if (parent) - spin_unlock_irq(&parent->lock); - + mutex_unlock(&freezer_mutex); return 0; } @@ -144,14 +135,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); - spin_lock_irq(&freezer->lock); + mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) atomic_dec(&system_freezing_cnt); freezer->state = 0; - spin_unlock_irq(&freezer->lock); + mutex_unlock(&freezer_mutex); } static void freezer_css_free(struct cgroup_subsys_state *css) @@ -175,7 +166,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, struct task_struct *task; bool clear_frozen = false; - spin_lock_irq(&freezer->lock); + mutex_lock(&freezer_mutex); /* * Make the new tasks conform to the current state of @new_css. @@ -197,21 +188,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, } } - spin_unlock_irq(&freezer->lock); - - /* - * Propagate FROZEN clearing upwards. We may race with - * update_if_frozen(), but as long as both work bottom-up, either - * update_if_frozen() sees child's FROZEN cleared or we clear the - * parent's FROZEN later. No parent w/ !FROZEN children can be - * left FROZEN. - */ + /* propagate FROZEN clearing upwards */ while (clear_frozen && (freezer = parent_freezer(freezer))) { - spin_lock_irq(&freezer->lock); freezer->state &= ~CGROUP_FROZEN; clear_frozen = freezer->state & CGROUP_FREEZING; - spin_unlock_irq(&freezer->lock); } + + mutex_unlock(&freezer_mutex); } /** @@ -228,9 +211,6 @@ static void freezer_fork(struct task_struct *task) { struct freezer *freezer; - rcu_read_lock(); - freezer = task_freezer(task); - /* * The root cgroup is non-freezable, so we can skip locking the * freezer. This is safe regardless of race with task migration. @@ -238,24 +218,18 @@ static void freezer_fork(struct task_struct *task) * to do. If we lost and root is the new cgroup, noop is still the * right thing to do. */ - if (!parent_freezer(freezer)) - goto out; + if (task_css_is_root(task, freezer_cgrp_id)) + return; - /* - * Grab @freezer->lock and freeze @task after verifying @task still - * belongs to @freezer and it's freezing. The former is for the - * case where we have raced against task migration and lost and - * @task is already in a different cgroup which may not be frozen. - * This isn't strictly necessary as freeze_task() is allowed to be - * called spuriously but let's do it anyway for, if nothing else, - * documentation. - */ - spin_lock_irq(&freezer->lock); - if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING)) + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + freezer = task_freezer(task); + if (freezer->state & CGROUP_FREEZING) freeze_task(task); - spin_unlock_irq(&freezer->lock); -out: + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); } /** @@ -281,22 +255,22 @@ static void update_if_frozen(struct cgroup_subsys_state *css) struct css_task_iter it; struct task_struct *task; - WARN_ON_ONCE(!rcu_read_lock_held()); - - spin_lock_irq(&freezer->lock); + lockdep_assert_held(&freezer_mutex); if (!(freezer->state & CGROUP_FREEZING) || (freezer->state & CGROUP_FROZEN)) - goto out_unlock; + return; /* are all (live) children frozen? */ + rcu_read_lock(); css_for_each_child(pos, css) { struct freezer *child = css_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && !(child->state & CGROUP_FROZEN)) - goto out_unlock; + return; } + rcu_read_unlock(); /* are all tasks frozen? */ css_task_iter_start(css, &it); @@ -317,21 +291,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css) freezer->state |= CGROUP_FROZEN; out_iter_end: css_task_iter_end(&it); -out_unlock: - spin_unlock_irq(&freezer->lock); } static int freezer_read(struct seq_file *m, void *v) { struct cgroup_subsys_state *css = seq_css(m), *pos; + mutex_lock(&freezer_mutex); rcu_read_lock(); /* update states bottom-up */ - css_for_each_descendant_post(pos, css) + css_for_each_descendant_post(pos, css) { + if (!css_tryget(pos)) + continue; + rcu_read_unlock(); + update_if_frozen(pos); + rcu_read_lock(); + css_put(pos); + } + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); seq_puts(m, freezer_state_strs(css_freezer(css)->state)); seq_putc(m, '\n'); @@ -373,7 +355,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, unsigned int state) { /* also synchronizes against task migration, see freezer_attach() */ - lockdep_assert_held(&freezer->lock); + lockdep_assert_held(&freezer_mutex); if (!(freezer->state & CGROUP_FREEZER_ONLINE)) return; @@ -414,31 +396,29 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) * descendant will try to inherit its parent's FREEZING state as * CGROUP_FREEZING_PARENT. */ + mutex_lock(&freezer_mutex); rcu_read_lock(); css_for_each_descendant_pre(pos, &freezer->css) { struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); - spin_lock_irq(&pos_f->lock); + if (!css_tryget(pos)) + continue; + rcu_read_unlock(); - if (pos_f == freezer) { + if (pos_f == freezer) freezer_apply_state(pos_f, freeze, CGROUP_FREEZING_SELF); - } else { - /* - * Our update to @parent->state is already visible - * which is all we need. No need to lock @parent. - * For more info on synchronization, see - * freezer_post_create(). - */ + else freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, CGROUP_FREEZING_PARENT); - } - spin_unlock_irq(&pos_f->lock); + rcu_read_lock(); + css_put(pos); } rcu_read_unlock(); + mutex_unlock(&freezer_mutex); } static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, -- cgit v1.2.3-70-g09d2 From 36e9d2ebcc15d029b33f42a36146ab5a5bcfcfe7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 11:28:30 -0400 Subject: cgroup: fix rcu_read_lock() leak in update_if_frozen() While updating cgroup_freezer locking, 68fafb77d827 ("cgroup_freezer: replace freezer->lock with freezer_mutex") introduced a bug in update_if_frozen() where it returns with rcu_read_lock() held. Fix it by adding rcu_read_unlock() before returning. Signed-off-by: Tejun Heo Reported-by: kbuild test robot --- kernel/cgroup_freezer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 12ead0b766e..345628c78b5 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -267,8 +267,10 @@ static void update_if_frozen(struct cgroup_subsys_state *css) struct freezer *child = css_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && - !(child->state & CGROUP_FROZEN)) + !(child->state & CGROUP_FROZEN)) { + rcu_read_unlock(); return; + } } rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From 866293ee54227584ffcb4a42f69c1f365974ba7f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2014 20:45:34 +0000 Subject: futex: Add another early deadlock detection check Dave Jones trinity syscall fuzzer exposed an issue in the deadlock detection code of rtmutex: http://lkml.kernel.org/r/20140429151655.GA14277@redhat.com That underlying issue has been fixed with a patch to the rtmutex code, but the futex code must not call into rtmutex in that case because - it can detect that issue early - it avoids a different and more complex fixup for backing out If the user space variable got manipulated to 0x80000000 which means no lock holder, but the waiters bit set and an active pi_state in the kernel is found we can figure out the recursive locking issue by looking at the pi_state owner. If that is the current task, then we can safely return -EDEADLK. The check should have been added in commit 59fa62451 (futex: Handle futex_pi OWNER_DIED take over correctly) already, but I did not see the above issue caused by user space manipulation back then. Signed-off-by: Thomas Gleixner Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Darren Hart Cc: Davidlohr Bueso Cc: Steven Rostedt Cc: Clark Williams Cc: Paul McKenney Cc: Lai Jiangshan Cc: Roland McGrath Cc: Carlos ODonell Cc: Jakub Jelinek Cc: Michael Kerrisk Cc: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/20140512201701.097349971@linutronix.de Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/futex.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5f589279e46..7c68225e396 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -745,7 +745,8 @@ void exit_pi_state_list(struct task_struct *curr) static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps) + union futex_key *key, struct futex_pi_state **ps, + struct task_struct *task) { struct futex_pi_state *pi_state = NULL; struct futex_q *this, *next; @@ -786,6 +787,16 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return -EINVAL; } + /* + * Protect against a corrupted uval. If uval + * is 0x80000000 then pid is 0 and the waiter + * bit is set. So the deadlock check in the + * calling code has failed and we did not fall + * into the check above due to !pid. + */ + if (task && pi_state->owner == task) + return -EDEADLK; + atomic_inc(&pi_state->refcount); *ps = pi_state; @@ -935,7 +946,7 @@ retry: * We dont have the lock. Look up the PI state (or create it if * we are the first waiter): */ - ret = lookup_pi_state(uval, hb, key, ps); + ret = lookup_pi_state(uval, hb, key, ps, task); if (unlikely(ret)) { switch (ret) { @@ -1347,7 +1358,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * * Return: * 0 - failed to acquire the lock atomically; - * 1 - acquired the lock; + * >0 - acquired the lock, return value is vpid of the top_waiter * <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1358,7 +1369,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, { struct futex_q *top_waiter = NULL; u32 curval; - int ret; + int ret, vpid; if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; @@ -1386,11 +1397,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * the contended case or if set_waiters is 1. The pi_state is returned * in ps in contended cases. */ + vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, set_waiters); - if (ret == 1) + if (ret == 1) { requeue_pi_wake_futex(top_waiter, key2, hb2); - + return vpid; + } return ret; } @@ -1421,7 +1434,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; - u32 curval2; if (requeue_pi) { /* @@ -1509,16 +1521,25 @@ retry_private: * At this point the top_waiter has either taken uaddr2 or is * waiting on it. If the former, then the pi_state will not * exist yet, look it up one more time to ensure we have a - * reference to it. + * reference to it. If the lock was taken, ret contains the + * vpid of the top waiter task. */ - if (ret == 1) { + if (ret > 0) { WARN_ON(pi_state); drop_count++; task_count++; - ret = get_futex_value_locked(&curval2, uaddr2); - if (!ret) - ret = lookup_pi_state(curval2, hb2, &key2, - &pi_state); + /* + * If we acquired the lock, then the user + * space value of uaddr2 should be vpid. It + * cannot be changed by the top waiter as it + * is blocked on hb2 lock if it tries to do + * so. If something fiddled with it behind our + * back the pi state lookup might unearth + * it. So we rather use the known value than + * rereading and handing potential crap to + * lookup_pi_state. + */ + ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL); } switch (ret) { -- cgit v1.2.3-70-g09d2 From f0d71b3dcb8332f7971b5f2363632573e6d9486a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2014 20:45:35 +0000 Subject: futex: Prevent attaching to kernel threads We happily allow userspace to declare a random kernel thread to be the owner of a user space PI futex. Found while analysing the fallout of Dave Jones syscall fuzzer. We also should validate the thread group for private futexes and find some fast way to validate whether the "alleged" owner has RW access on the file which backs the SHM, but that's a separate issue. Signed-off-by: Thomas Gleixner Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Darren Hart Cc: Davidlohr Bueso Cc: Steven Rostedt Cc: Clark Williams Cc: Paul McKenney Cc: Lai Jiangshan Cc: Roland McGrath Cc: Carlos ODonell Cc: Jakub Jelinek Cc: Michael Kerrisk Cc: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/20140512201701.194824402@linutronix.de Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/futex.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 7c68225e396..81dbe773ce4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -814,6 +814,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, if (!p) return -ESRCH; + if (!p->mm) { + put_task_struct(p); + return -EPERM; + } + /* * We need to look at the task state flags to figure out, * whether the task is exiting. To protect against the do_exit -- cgit v1.2.3-70-g09d2 From 0819b2e30ccb93edf04876237b6205eef84ec8d2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 May 2014 20:23:48 +0200 Subject: perf: Limit perf_event_attr::sample_period to 63 bits Vince reported that using a large sample_period (one with bit 63 set) results in wreckage since while the sample_period is fundamentally unsigned (negative periods don't make sense) the way we implement things very much rely on signed logic. So limit sample_period to 63 bits to avoid tripping over this. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/n/tip-p25fhunibl4y3qi0zuqmyf4b@git.kernel.org Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 71232844f23..1d1ec6453a0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7029,6 +7029,9 @@ SYSCALL_DEFINE5(perf_event_open, if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; + } else { + if (attr.sample_period & (1ULL << 63)) + return -EINVAL; } /* -- cgit v1.2.3-70-g09d2 From 39af6b1678afa5880dda7e375cf3f9d395087f6d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Apr 2014 11:04:08 +0200 Subject: perf: Prevent false warning in perf_swevent_add The perf cpu offline callback takes down all cpu context events and releases swhash->swevent_hlist. This could race with task context software event being just scheduled on this cpu via perf_swevent_add while cpu hotplug code already cleaned up event's data. The race happens in the gap between the cpu notifier code and the cpu being actually taken down. Note that only cpu ctx events are terminated in the perf cpu hotplug code. It's easily reproduced with: $ perf record -e faults perf bench sched pipe while putting one of the cpus offline: # echo 0 > /sys/devices/system/cpu/cpu1/online Console emits following warning: WARNING: CPU: 1 PID: 2845 at kernel/events/core.c:5672 perf_swevent_add+0x18d/0x1a0() Modules linked in: CPU: 1 PID: 2845 Comm: sched-pipe Tainted: G W 3.14.0+ #256 Hardware name: Intel Corporation Montevina platform/To be filled by O.E.M., BIOS AMVACRB1.86C.0066.B00.0805070703 05/07/2008 0000000000000009 ffff880077233ab8 ffffffff81665a23 0000000000200005 0000000000000000 ffff880077233af8 ffffffff8104732c 0000000000000046 ffff88007467c800 0000000000000002 ffff88007a9cf2a0 0000000000000001 Call Trace: [] dump_stack+0x4f/0x7c [] warn_slowpath_common+0x8c/0xc0 [] warn_slowpath_null+0x1a/0x20 [] perf_swevent_add+0x18d/0x1a0 [] event_sched_in.isra.75+0x9e/0x1f0 [] group_sched_in+0x6a/0x1f0 [] ? sched_clock_local+0x25/0xa0 [] ctx_sched_in+0x1f6/0x450 [] perf_event_sched_in+0x6b/0xa0 [] perf_event_context_sched_in+0x7b/0xc0 [] __perf_event_task_sched_in+0x43e/0x460 [] ? put_lock_stats.isra.18+0xe/0x30 [] finish_task_switch+0xb8/0x100 [] __schedule+0x30e/0xad0 [] ? pipe_read+0x3e2/0x560 [] ? preempt_schedule_irq+0x3e/0x70 [] ? preempt_schedule_irq+0x3e/0x70 [] preempt_schedule_irq+0x44/0x70 [] retint_kernel+0x20/0x30 [] ? lockdep_sys_exit+0x1a/0x90 [] lockdep_sys_exit_thunk+0x35/0x67 [] ? sysret_check+0x5/0x56 Fixing this by tracking the cpu hotplug state and displaying the WARN only if current cpu is initialized properly. Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: stable@vger.kernel.org Reported-by: Fengguang Wu Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1396861448-10097-1-git-send-email-jolsa@redhat.com Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1d1ec6453a0..feb1329ca33 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5419,6 +5419,9 @@ struct swevent_htable { /* Recursion avoidance in each contexts */ int recursion[PERF_NR_CONTEXTS]; + + /* Keeps track of cpu being initialized/exited */ + bool online; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); @@ -5665,8 +5668,14 @@ static int perf_swevent_add(struct perf_event *event, int flags) hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); - if (WARN_ON_ONCE(!head)) + if (!head) { + /* + * We can race with cpu hotplug code. Do not + * WARN if the cpu just got unplugged. + */ + WARN_ON_ONCE(swhash->online); return -EINVAL; + } hlist_add_head_rcu(&event->hlist_entry, head); @@ -7845,6 +7854,7 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); + swhash->online = true; if (swhash->hlist_refcount > 0) { struct swevent_hlist *hlist; @@ -7902,6 +7912,7 @@ static void perf_event_exit_cpu(int cpu) perf_event_exit_cpu_context(cpu); mutex_lock(&swhash->hlist_mutex); + swhash->online = false; swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); } -- cgit v1.2.3-70-g09d2 From b69cf53640da2b86439596118cfa95233154ee76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Mar 2014 10:50:33 +0100 Subject: perf: Fix a race between ring_buffer_detach() and ring_buffer_attach() Alexander noticed that we use RCU iteration on rb->event_list but do not use list_{add,del}_rcu() to add,remove entries to that list, nor do we observe proper grace periods when re-using the entries. Merge ring_buffer_detach() into ring_buffer_attach() such that attaching to the NULL buffer is detaching. Furthermore, ensure that between any 'detach' and 'attach' of the same event we observe the required grace period, but only when strictly required. In effect this means that only ioctl(.request = PERF_EVENT_IOC_SET_OUTPUT) will wait for a grace period, while the normal initial attach and final detach will not be delayed. This patch should, I think, do the right thing under all circumstances, the 'normal' cases all should never see the extra grace period, but the two cases: 1) PERF_EVENT_IOC_SET_OUTPUT on an event which already has a ring_buffer set, will now observe the required grace period between removing itself from the old and attaching itself to the new buffer. This case is 'simple' in that both buffers are present in perf_event_set_output() one could think an unconditional synchronize_rcu() would be sufficient; however... 2) an event that has a buffer attached, the buffer is destroyed (munmap) and then the event is attached to a new/different buffer using PERF_EVENT_IOC_SET_OUTPUT. This case is more complex because the buffer destruction does: ring_buffer_attach(.rb = NULL) followed by the ioctl() doing: ring_buffer_attach(.rb = foo); and we still need to observe the grace period between these two calls due to us reusing the event->rb_entry list_head. In order to make 2 happen we use Paul's latest cond_synchronize_rcu() call. Cc: Paul Mackerras Cc: Stephane Eranian Cc: Andi Kleen Cc: "Paul E. McKenney" Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Mike Galbraith Reported-by: Alexander Shishkin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140507123526.GD13658@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- include/linux/perf_event.h | 2 + kernel/events/core.c | 109 ++++++++++++++++++++------------------------- 2 files changed, 51 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3356abcfff1..3ef6ea12806 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -402,6 +402,8 @@ struct perf_event { struct ring_buffer *rb; struct list_head rb_entry; + unsigned long rcu_batches; + int rcu_pending; /* poll related */ wait_queue_head_t waitq; diff --git a/kernel/events/core.c b/kernel/events/core.c index feb1329ca33..440eefc6739 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3192,7 +3192,8 @@ static void free_event_rcu(struct rcu_head *head) } static void ring_buffer_put(struct ring_buffer *rb); -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb); static void unaccount_event_cpu(struct perf_event *event, int cpu) { @@ -3252,8 +3253,6 @@ static void free_event(struct perf_event *event) unaccount_event(event); if (event->rb) { - struct ring_buffer *rb; - /* * Can happen when we close an event with re-directed output. * @@ -3261,12 +3260,7 @@ static void free_event(struct perf_event *event) * over us; possibly making our ring_buffer_put() the last. */ mutex_lock(&event->mmap_mutex); - rb = event->rb; - if (rb) { - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - ring_buffer_put(rb); /* could be last */ - } + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); } @@ -3850,28 +3844,47 @@ unlock: static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb) { + struct ring_buffer *old_rb = NULL; unsigned long flags; - if (!list_empty(&event->rb_entry)) - return; + if (event->rb) { + /* + * Should be impossible, we set this when removing + * event->rb_entry and wait/clear when adding event->rb_entry. + */ + WARN_ON_ONCE(event->rcu_pending); - spin_lock_irqsave(&rb->event_lock, flags); - if (list_empty(&event->rb_entry)) - list_add(&event->rb_entry, &rb->event_list); - spin_unlock_irqrestore(&rb->event_lock, flags); -} + old_rb = event->rb; + event->rcu_batches = get_state_synchronize_rcu(); + event->rcu_pending = 1; -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) -{ - unsigned long flags; + spin_lock_irqsave(&old_rb->event_lock, flags); + list_del_rcu(&event->rb_entry); + spin_unlock_irqrestore(&old_rb->event_lock, flags); + } - if (list_empty(&event->rb_entry)) - return; + if (event->rcu_pending && rb) { + cond_synchronize_rcu(event->rcu_batches); + event->rcu_pending = 0; + } - spin_lock_irqsave(&rb->event_lock, flags); - list_del_init(&event->rb_entry); - wake_up_all(&event->waitq); - spin_unlock_irqrestore(&rb->event_lock, flags); + if (rb) { + spin_lock_irqsave(&rb->event_lock, flags); + list_add_rcu(&event->rb_entry, &rb->event_list); + spin_unlock_irqrestore(&rb->event_lock, flags); + } + + rcu_assign_pointer(event->rb, rb); + + if (old_rb) { + ring_buffer_put(old_rb); + /* + * Since we detached before setting the new rb, so that we + * could attach the new rb, we could have missed a wakeup. + * Provide it now. + */ + wake_up_all(&event->waitq); + } } static void ring_buffer_wakeup(struct perf_event *event) @@ -3940,7 +3953,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - struct ring_buffer *rb = event->rb; + struct ring_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); @@ -3948,18 +3961,14 @@ static void perf_mmap_close(struct vm_area_struct *vma) atomic_dec(&rb->mmap_count); if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) - return; + goto out_put; - /* Detach current event from the buffer. */ - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ - if (atomic_read(&rb->mmap_count)) { - ring_buffer_put(rb); /* can't be last */ - return; - } + if (atomic_read(&rb->mmap_count)) + goto out_put; /* * No other mmap()s, detach from all other events that might redirect @@ -3989,11 +3998,9 @@ again: * still restart the iteration to make sure we're not now * iterating the wrong list. */ - if (event->rb == rb) { - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - ring_buffer_put(rb); /* can't be last, we still have one */ - } + if (event->rb == rb) + ring_buffer_attach(event, NULL); + mutex_unlock(&event->mmap_mutex); put_event(event); @@ -4018,6 +4025,7 @@ again: vma->vm_mm->pinned_vm -= mmap_locked; free_uid(mmap_user); +out_put: ring_buffer_put(rb); /* could be last */ } @@ -4135,7 +4143,6 @@ again: vma->vm_mm->pinned_vm += extra; ring_buffer_attach(event, rb); - rcu_assign_pointer(event->rb, rb); perf_event_init_userpage(event); perf_event_update_userpage(event); @@ -6934,7 +6941,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct ring_buffer *rb = NULL, *old_rb = NULL; + struct ring_buffer *rb = NULL; int ret = -EINVAL; if (!output_event) @@ -6962,8 +6969,6 @@ set: if (atomic_read(&event->mmap_count)) goto unlock; - old_rb = event->rb; - if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); @@ -6971,23 +6976,7 @@ set: goto unlock; } - if (old_rb) - ring_buffer_detach(event, old_rb); - - if (rb) - ring_buffer_attach(event, rb); - - rcu_assign_pointer(event->rb, rb); - - if (old_rb) { - ring_buffer_put(old_rb); - /* - * Since we detached before setting the new rb, so that we - * could attach the new rb, we could have missed a wakeup. - * Provide it now. - */ - wake_up_all(&event->waitq); - } + ring_buffer_attach(event, rb); ret = 0; unlock: -- cgit v1.2.3-70-g09d2 From 143cf23df25b7082cd706c3c53188e741e7881c3 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 9 May 2014 16:54:15 +0200 Subject: sched: Make sched_setattr() correctly return -EFBIG The documented[1] behavior of sched_attr() in the proposed man page text is: sched_attr::size must be set to the size of the structure, as in sizeof(struct sched_attr), if the provided structure is smaller than the kernel structure, any additional fields are assumed '0'. If the provided structure is larger than the kernel structure, the kernel verifies all additional fields are '0' if not the syscall will fail with -E2BIG. As currently implemented, sched_copy_attr() returns -EFBIG for for this case, but the logic in sys_sched_setattr() converts that error to -EFAULT. This patch fixes the behavior. [1] http://thread.gmane.org/gmane.linux.kernel/1615615/focus=1697760 Signed-off-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Cc: Cc: Linus Torvalds Link: http://lkml.kernel.org/r/536CEC17.9070903@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13584f1cccf..f2205f02eb7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3658,8 +3658,9 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (!uattr || pid < 0 || flags) return -EINVAL; - if (sched_copy_attr(uattr, &attr)) - return -EFAULT; + retval = sched_copy_attr(uattr, &attr); + if (retval) + return retval; rcu_read_lock(); retval = -ESRCH; -- cgit v1.2.3-70-g09d2 From dbdb22754fde671dc93d2fae06f8be113d47f2fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 May 2014 10:49:03 +0200 Subject: sched: Disallow sched_attr::sched_policy < 0 The scheduler uses policy=-1 to preserve the current policy state to implement sys_sched_setparam(), this got exposed to userspace by accident through sys_sched_setattr(), cure this. Reported-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Acked-by: Michael Kerrisk Cc: Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140509085311.GJ30445@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f2205f02eb7..cdefcf7c592 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3662,6 +3662,9 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (retval) return retval; + if (attr.sched_policy < 0) + return -EINVAL; + rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); -- cgit v1.2.3-70-g09d2 From ce5f7f8200ca2504f6f290044393d73ca314965a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 May 2014 22:50:34 +0200 Subject: sched/deadline: Change sched_getparam() behaviour vs SCHED_DEADLINE The way we read POSIX one should only call sched_getparam() when sched_getscheduler() returns either SCHED_FIFO or SCHED_RR. Given that we currently return sched_param::sched_priority=0 for all others, extend the same behaviour to SCHED_DEADLINE. Requested-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Acked-by: Michael Kerrisk Cc: Dario Faggioli Cc: linux-man Cc: "Michael Kerrisk (man-pages)" Cc: Juri Lelli Cc: Linus Torvalds Cc: Link: http://lkml.kernel.org/r/20140512205034.GH13467@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cdefcf7c592..f3f08bf9435 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3713,7 +3713,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) */ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) { - struct sched_param lp; + struct sched_param lp = { .sched_priority = 0 }; struct task_struct *p; int retval; @@ -3730,11 +3730,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (retval) goto out_unlock; - if (task_has_dl_policy(p)) { - retval = -EINVAL; - goto out_unlock; - } - lp.sched_priority = p->rt_priority; + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; rcu_read_unlock(); /* -- cgit v1.2.3-70-g09d2 From b0827819b0da4acfbc1df1e05edcf50efd07cbd1 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 13 May 2014 14:11:31 +0200 Subject: sched/deadline: Restrict user params max value to 2^63 ns Michael Kerrisk noticed that creating SCHED_DEADLINE reservations with certain parameters (e.g, a runtime of something near 2^64 ns) can cause a system freeze for some amount of time. The problem is that in the interface we have u64 sched_runtime; while internally we need to have a signed runtime (to cope with budget overruns) s64 runtime; At the time we setup a new dl_entity we copy the first value in the second. The cast turns out with negative values when sched_runtime is too big, and this causes the scheduler to go crazy right from the start. Moreover, considering how we deal with deadlines wraparound (s64)(a - b) < 0 we also have to restrict acceptable values for sched_{deadline,period}. This patch fixes the thing checking that user parameters are always below 2^63 ns (still large enough for everyone). It also rewrites other conditions that we check, since in __checkparam_dl we don't have to deal with deadline wraparounds and what we have now erroneously fails when the difference between values is too big. Reported-by: Michael Kerrisk Suggested-by: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: Cc: Dario Faggioli Cc: Dave Jones Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140513141131.20d944f81633ee937f256385@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3f08bf9435..44e00abece0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3195,17 +3195,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr) * We ask for the deadline not being zero, and greater or equal * than the runtime, as well as the period of being zero or * greater than deadline. Furthermore, we have to be sure that - * user parameters are above the internal resolution (1us); we - * check sched_runtime only since it is always the smaller one. + * user parameters are above the internal resolution of 1us (we + * check sched_runtime only since it is always the smaller one) and + * below 2^63 ns (we have to check both sched_deadline and + * sched_period, as the latter can be zero). */ static bool __checkparam_dl(const struct sched_attr *attr) { - return attr && attr->sched_deadline != 0 && - (attr->sched_period == 0 || - (s64)(attr->sched_period - attr->sched_deadline) >= 0) && - (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && - attr->sched_runtime >= (2 << (DL_SCALE - 1)); + /* deadline != 0 */ + if (attr->sched_deadline == 0) + return false; + + /* + * Since we truncate DL_SCALE bits, make sure we're at least + * that big. + */ + if (attr->sched_runtime < (1ULL << DL_SCALE)) + return false; + + /* + * Since we use the MSB for wrap-around and sign issues, make + * sure it's not set (mind that period can be equal to zero). + */ + if (attr->sched_deadline & (1ULL << 63) || + attr->sched_period & (1ULL << 63)) + return false; + + /* runtime <= deadline <= period (if period != 0) */ + if ((attr->sched_period != 0 && + attr->sched_period < attr->sched_deadline) || + attr->sched_deadline < attr->sched_runtime) + return false; + + return true; } /* -- cgit v1.2.3-70-g09d2 From 944770ab54babaef29d9d1dc8189898b3ee8afcf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 May 2014 16:13:56 +0200 Subject: sched/deadline: Replace NR_CPUS arrays Tejun reported that his resume was failing due to order-3 allocations from sched_domain building. Replace the NR_CPUS arrays in there with a dynamically allocated array. Reported-by: Tejun Heo Signed-off-by: Peter Zijlstra Acked-by: Juri Lelli Cc: Johannes Weiner Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-kat4gl1m5a6dwy6nzuqox45e@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 33 ++++++++++++++++++++++++--------- kernel/sched/cpudeadline.h | 6 +++--- 2 files changed, 27 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index ab001b5d504..bd95963dae8 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -13,6 +13,7 @@ #include #include +#include #include "cpudeadline.h" static inline int parent(int i) @@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b) { int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; - swap(cp->elements[a], cp->elements[b]); - swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); + swap(cp->elements[a].cpu, cp->elements[b].cpu); + swap(cp->elements[a].dl , cp->elements[b].dl ); + + swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); } static void cpudl_heapify(struct cpudl *cp, int idx) @@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); - old_idx = cp->cpu_to_idx[cpu]; + old_idx = cp->elements[cpu].idx; if (!is_valid) { /* remove item */ if (old_idx == IDX_INVALID) { @@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; cp->elements[old_idx].cpu = new_cpu; cp->size--; - cp->cpu_to_idx[new_cpu] = old_idx; - cp->cpu_to_idx[cpu] = IDX_INVALID; + cp->elements[new_cpu].idx = old_idx; + cp->elements[cpu].idx = IDX_INVALID; while (old_idx > 0 && dl_time_before( cp->elements[parent(old_idx)].dl, cp->elements[old_idx].dl)) { @@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->size++; cp->elements[cp->size - 1].dl = 0; cp->elements[cp->size - 1].cpu = cpu; - cp->cpu_to_idx[cpu] = cp->size - 1; + cp->elements[cpu].idx = cp->size - 1; cpudl_change_key(cp, cp->size - 1, dl); cpumask_clear_cpu(cpu, cp->free_cpus); } else { @@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp) memset(cp, 0, sizeof(*cp)); raw_spin_lock_init(&cp->lock); cp->size = 0; - for (i = 0; i < NR_CPUS; i++) - cp->cpu_to_idx[i] = IDX_INVALID; - if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) + + cp->elements = kcalloc(nr_cpu_ids, + sizeof(struct cpudl_item), + GFP_KERNEL); + if (!cp->elements) + return -ENOMEM; + + if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { + kfree(cp->elements); return -ENOMEM; + } + + for_each_possible_cpu(i) + cp->elements[i].idx = IDX_INVALID; + cpumask_setall(cp->free_cpus); return 0; @@ -211,4 +225,5 @@ int cpudl_init(struct cpudl *cp) void cpudl_cleanup(struct cpudl *cp) { free_cpumask_var(cp->free_cpus); + kfree(cp->elements); } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index a202789a412..538c9796ad4 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -5,17 +5,17 @@ #define IDX_INVALID -1 -struct array_item { +struct cpudl_item { u64 dl; int cpu; + int idx; }; struct cpudl { raw_spinlock_t lock; int size; - int cpu_to_idx[NR_CPUS]; - struct array_item elements[NR_CPUS]; cpumask_var_t free_cpus; + struct cpudl_item *elements; }; -- cgit v1.2.3-70-g09d2 From 4dac0b638310d2e92f6e19958b73d4c97c9734bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 May 2014 16:04:26 +0200 Subject: sched/cpupri: Replace NR_CPUS arrays Tejun reported that his resume was failing due to order-3 allocations from sched_domain building. Replace the NR_CPUS arrays in there with a dynamically allocated array. Reported-by: Tejun Heo Signed-off-by: Peter Zijlstra Cc: Johannes Weiner Cc: Steven Rostedt Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-7cysnkw1gik45r864t1nkudh@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cpupri.c | 7 +++++++ kernel/sched/cpupri.h | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 3031bac8aa3..8834243abee 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ @@ -218,8 +219,13 @@ int cpupri_init(struct cpupri *cp) goto cleanup; } + cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL); + if (!cp->cpu_to_pri) + goto cleanup; + for_each_possible_cpu(i) cp->cpu_to_pri[i] = CPUPRI_INVALID; + return 0; cleanup: @@ -236,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp) { int i; + kfree(cp->cpu_to_pri); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) free_cpumask_var(cp->pri_to_cpu[i].mask); } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index f6d75617349..6b033347fdf 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -17,7 +17,7 @@ struct cpupri_vec { struct cpupri { struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; - int cpu_to_pri[NR_CPUS]; + int *cpu_to_pri; }; #ifdef CONFIG_SMP -- cgit v1.2.3-70-g09d2 From 6acbfb96976fc3350e30d964acb1dbbdf876d55e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 16 May 2014 11:50:42 +0800 Subject: sched: Fix hotplug vs. set_cpus_allowed_ptr() Lai found that: WARNING: CPU: 1 PID: 13 at arch/x86/kernel/smp.c:124 native_smp_send_reschedule+0x2d/0x4b() ... migration_cpu_stop+0x1d/0x22 was caused by set_cpus_allowed_ptr() assuming that cpu_active_mask is always a sub-set of cpu_online_mask. This isn't true since 5fbd036b552f ("sched: Cleanup cpu_active madness"). So set active and online at the same time to avoid this particular problem. Fixes: 5fbd036b552f ("sched: Cleanup cpu_active madness") Signed-off-by: Lai Jiangshan Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Gautham R. Shenoy Cc: Linus Torvalds Cc: Michael wang Cc: Paul Gortmaker Cc: Rafael J. Wysocki Cc: Srivatsa S. Bhat Cc: Toshi Kani Link: http://lkml.kernel.org/r/53758B12.8060609@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 6 ++++-- kernel/sched/core.c | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a9e710eef0e..247979a1b81 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -726,10 +726,12 @@ void set_cpu_present(unsigned int cpu, bool present) void set_cpu_online(unsigned int cpu, bool online) { - if (online) + if (online) { cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); - else + cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); + } else { cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); + } } void set_cpu_active(unsigned int cpu, bool active) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44e00abece0..86f3890c3d0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5076,7 +5076,6 @@ static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; -- cgit v1.2.3-70-g09d2 From 011e4b02f1da156ac7fea28a9da878f3c23af739 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 27 May 2014 16:25:34 +0530 Subject: powerpc, kexec: Fix "Processor X is stuck" issue during kexec from ST mode If we try to perform a kexec when the machine is in ST (Single-Threaded) mode (ppc64_cpu --smt=off), the kexec operation doesn't succeed properly, and we get the following messages during boot: [ 0.089866] POWER8 performance monitor hardware support registered [ 0.089985] power8-pmu: PMAO restore workaround active. [ 5.095419] Processor 1 is stuck. [ 10.097933] Processor 2 is stuck. [ 15.100480] Processor 3 is stuck. [ 20.102982] Processor 4 is stuck. [ 25.105489] Processor 5 is stuck. [ 30.108005] Processor 6 is stuck. [ 35.110518] Processor 7 is stuck. [ 40.113369] Processor 9 is stuck. [ 45.115879] Processor 10 is stuck. [ 50.118389] Processor 11 is stuck. [ 55.120904] Processor 12 is stuck. [ 60.123425] Processor 13 is stuck. [ 65.125970] Processor 14 is stuck. [ 70.128495] Processor 15 is stuck. [ 75.131316] Processor 17 is stuck. Note that only the sibling threads are stuck, while the primary threads (0, 8, 16 etc) boot just fine. Looking closer at the previous step of kexec, we observe that kexec tries to wakeup (bring online) the sibling threads of all the cores, before performing kexec: [ 9464.131231] Starting new kernel [ 9464.148507] kexec: Waking offline cpu 1. [ 9464.148552] kexec: Waking offline cpu 2. [ 9464.148600] kexec: Waking offline cpu 3. [ 9464.148636] kexec: Waking offline cpu 4. [ 9464.148671] kexec: Waking offline cpu 5. [ 9464.148708] kexec: Waking offline cpu 6. [ 9464.148743] kexec: Waking offline cpu 7. [ 9464.148779] kexec: Waking offline cpu 9. [ 9464.148815] kexec: Waking offline cpu 10. [ 9464.148851] kexec: Waking offline cpu 11. [ 9464.148887] kexec: Waking offline cpu 12. [ 9464.148922] kexec: Waking offline cpu 13. [ 9464.148958] kexec: Waking offline cpu 14. [ 9464.148994] kexec: Waking offline cpu 15. [ 9464.149030] kexec: Waking offline cpu 17. Instrumenting this piece of code revealed that the cpu_up() operation actually fails with -EBUSY. Thus, only the primary threads of all the cores are online during kexec, and hence this is a sure-shot receipe for disaster, as explained in commit e8e5c2155b (powerpc/kexec: Fix orphaned offline CPUs across kexec), as well as in the comment above wake_offline_cpus(). It turns out that cpu_up() was returning -EBUSY because the variable 'cpu_hotplug_disabled' was set to 1; and this disabling of CPU hotplug was done by migrate_to_reboot_cpu() inside kernel_kexec(). Now, migrate_to_reboot_cpu() was originally written with the assumption that any further code will not need to perform CPU hotplug, since we are anyway in the reboot path. However, kexec is clearly not such a case, since we depend on onlining CPUs, atleast on powerpc. So re-enable cpu-hotplug after returning from migrate_to_reboot_cpu() in the kexec path, to fix this regression in kexec on powerpc. Also, wrap the cpu_up() in powerpc kexec code within a WARN_ON(), so that we can catch such issues more easily in the future. Fixes: c97102ba963 (kexec: migrate to reboot cpu) Cc: stable@vger.kernel.org Signed-off-by: Srivatsa S. Bhat Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/machine_kexec_64.c | 2 +- kernel/kexec.c | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 59d229a2a3e..879b3aacac3 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -237,7 +237,7 @@ static void wake_offline_cpus(void) if (!cpu_online(cpu)) { printk(KERN_INFO "kexec: Waking offline cpu %d.\n", cpu); - cpu_up(cpu); + WARN_ON(cpu_up(cpu)); } } } diff --git a/kernel/kexec.c b/kernel/kexec.c index c8380ad203b..28c57069ef6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1683,6 +1683,14 @@ int kernel_kexec(void) kexec_in_progress = true; kernel_restart_prepare(NULL); migrate_to_reboot_cpu(); + + /* + * migrate_to_reboot_cpu() disables CPU hotplug assuming that + * no further code needs to use CPU hotplug (which is true in + * the reboot case). However, the kexec path depends on using + * CPU hotplug again; so re-enable it here. + */ + cpu_hotplug_enable(); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); } -- cgit v1.2.3-70-g09d2 From 397335f004f41e5fcf7a795e94eb3ab83411a17c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 22 May 2014 03:25:39 +0000 Subject: rtmutex: Fix deadlock detector for real The current deadlock detection logic does not work reliably due to the following early exit path: /* * Drop out, when the task has no waiters. Note, * top_waiter can be NULL, when we are in the deboosting * mode! */ if (top_waiter && (!task_has_pi_waiters(task) || top_waiter != task_top_pi_waiter(task))) goto out_unlock_pi; So this not only exits when the task has no waiters, it also exits unconditionally when the current waiter is not the top priority waiter of the task. So in a nested locking scenario, it might abort the lock chain walk and therefor miss a potential deadlock. Simple fix: Continue the chain walk, when deadlock detection is enabled. We also avoid the whole enqueue, if we detect the deadlock right away (A-A). It's an optimization, but also prevents that another waiter who comes in after the detection and before the task has undone the damage observes the situation and detects the deadlock and returns -EDEADLOCK, which is wrong as the other task is not in a deadlock situation. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Reviewed-by: Steven Rostedt Cc: Lai Jiangshan Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20140522031949.725272460@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index aa4dff04b59..a620d4d08ca 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -343,9 +343,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * top_waiter can be NULL, when we are in the deboosting * mode! */ - if (top_waiter && (!task_has_pi_waiters(task) || - top_waiter != task_top_pi_waiter(task))) - goto out_unlock_pi; + if (top_waiter) { + if (!task_has_pi_waiters(task)) + goto out_unlock_pi; + /* + * If deadlock detection is off, we stop here if we + * are not the top pi waiter of the task. + */ + if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) + goto out_unlock_pi; + } /* * When deadlock detection is off then we check, if further @@ -361,7 +368,12 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto retry; } - /* Deadlock detection */ + /* + * Deadlock detection. If the lock is the same as the original + * lock which caused us to walk the lock chain or if the + * current lock is owned by the task which initiated the chain + * walk, we detected a deadlock. + */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); @@ -527,6 +539,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0, res; + /* + * Early deadlock detection. We really don't want the task to + * enqueue on itself just to untangle the mess later. It's not + * only an optimization. We drop the locks, so another waiter + * can come in before the chain walk detects the deadlock. So + * the other will detect the deadlock and return -EDEADLOCK, + * which is wrong, as the other waiter is not in a deadlock + * situation. + */ + if (detect_deadlock && owner == task) + return -EDEADLK; + raw_spin_lock_irqsave(&task->pi_lock, flags); __rt_mutex_adjust_prio(task); waiter->task = task; -- cgit v1.2.3-70-g09d2