diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 8101 |
1 files changed, 8101 insertions, 0 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c new file mode 100644 index 00000000000..ca8fd44145a --- /dev/null +++ b/kernel/sched/core.c @@ -0,0 +1,8101 @@ +/* + * kernel/sched/core.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and + * make semaphores SMP safe + * 1998-11-19 Implemented schedule_timeout() and related stuff + * by Andrea Arcangeli + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin + * 2007-04-15 Work begun on replacing all interactivity tuning with a + * fair scheduling design by Con Kolivas. + * 2007-05-05 Load balancing (smp-nice) and other improvements + * by Peter Williams + * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith + * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri + * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, + * Thomas Gleixner, Mike Kravetz + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/nmi.h> +#include <linux/init.h> +#include <linux/uaccess.h> +#include <linux/highmem.h> +#include <asm/mmu_context.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/completion.h> +#include <linux/kernel_stat.h> +#include <linux/debug_locks.h> +#include <linux/perf_event.h> +#include <linux/security.h> +#include <linux/notifier.h> +#include <linux/profile.h> +#include <linux/freezer.h> +#include <linux/vmalloc.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/pid_namespace.h> +#include <linux/smp.h> +#include <linux/threads.h> +#include <linux/timer.h> +#include <linux/rcupdate.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/percpu.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/sysctl.h> +#include <linux/syscalls.h> +#include <linux/times.h> +#include <linux/tsacct_kern.h> +#include <linux/kprobes.h> +#include <linux/delayacct.h> +#include <linux/unistd.h> +#include <linux/pagemap.h> +#include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/debugfs.h> +#include <linux/ctype.h> +#include <linux/ftrace.h> +#include <linux/slab.h> +#include <linux/init_task.h> + +#include <asm/tlb.h> +#include <asm/irq_regs.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif + +#include "sched.h" +#include "../workqueue_sched.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/sched.h> + +void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) +{ + unsigned long delta; + ktime_t soft, hard, now; + + for (;;) { + if (hrtimer_active(period_timer)) + break; + + now = hrtimer_cb_get_time(period_timer); + hrtimer_forward(period_timer, now, period); + + soft = hrtimer_get_softexpires(period_timer); + hard = hrtimer_get_expires(period_timer); + delta = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(period_timer, soft, delta, + HRTIMER_MODE_ABS_PINNED, 0); + } +} + +DEFINE_MUTEX(sched_domains_mutex); +DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +static void update_rq_clock_task(struct rq *rq, s64 delta); + +void update_rq_clock(struct rq *rq) +{ + s64 delta; + + if (rq->skip_clock_update > 0) + return; + + delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + rq->clock += delta; + update_rq_clock_task(rq, delta); +} + +/* + * Debugging: various feature bits + */ + +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | + +const_debug unsigned int sysctl_sched_features = +#include "features.h" + 0; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_FEAT(name, enabled) \ + #name , + +static __read_mostly char *sched_feat_names[] = { +#include "features.h" + NULL +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; sched_feat_names[i]; i++) { + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + if (strncmp(cmp, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; sched_feat_names[i]; i++) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { + if (neg) + sysctl_sched_features &= ~(1UL << i); + else + sysctl_sched_features |= (1UL << i); + break; + } + } + + if (!sched_feat_names[i]) + return -EINVAL; + + *ppos += cnt; + + return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + +static const struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#endif + +/* + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +const_debug unsigned int sysctl_sched_nr_migrate = 32; + +/* + * period over which we average the RT time consumption, measured + * in ms. + * + * default: 1s + */ +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; + +/* + * period over which we measure -rt task cpu usage in us. + * default: 1s + */ +unsigned int sysctl_sched_rt_period = 1000000; + +__read_mostly int scheduler_running; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + + + +/* + * __task_rq_lock - lock the rq @p resides on. + */ +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; + raw_spin_unlock(&rq->lock); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + } +} + +static void __task_rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) + __releases(p->pi_lock) +{ + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); +} + +/* + * this_rq_lock - lock this runqueue and disable interrupts. + */ +static struct rq *this_rq_lock(void) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + raw_spin_lock(&rq->lock); + + return rq; +} + +#ifdef CONFIG_SCHED_HRTICK +/* + * Use HR-timers to deliver accurate preemption points. + * + * Its all a bit involved since we cannot program an hrt while holding the + * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a + * reschedule event. + * + * When we get rescheduled we reprogram the hrtick_timer outside of the + * rq->lock. + */ + +static void hrtick_clear(struct rq *rq) +{ + if (hrtimer_active(&rq->hrtick_timer)) + hrtimer_cancel(&rq->hrtick_timer); +} + +/* + * High-resolution timer tick. + * Runs from hardirq context with interrupts disabled. + */ +static enum hrtimer_restart hrtick(struct hrtimer *timer) +{ + struct rq *rq = container_of(timer, struct rq, hrtick_timer); + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + raw_spin_lock(&rq->lock); + update_rq_clock(rq); + rq->curr->sched_class->task_tick(rq, rq->curr, 1); + raw_spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +#ifdef CONFIG_SMP +/* + * called from hardirq (IPI) context + */ +static void __hrtick_start(void *arg) +{ + struct rq *rq = arg; + + raw_spin_lock(&rq->lock); + hrtimer_restart(&rq->hrtick_timer); + rq->hrtick_csd_pending = 0; + raw_spin_unlock(&rq->lock); +} + +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +void hrtick_start(struct rq *rq, u64 delay) +{ + struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = ktime_add_ns(timer->base->get_time(), delay); + + hrtimer_set_expires(timer, time); + + if (rq == this_rq()) { + hrtimer_restart(timer); + } else if (!rq->hrtick_csd_pending) { + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); + rq->hrtick_csd_pending = 1; + } +} + +static int +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + hrtick_clear(cpu_rq(cpu)); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static __init void init_hrtick(void) +{ + hotcpu_notifier(hotplug_hrtick, 0); +} +#else +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +void hrtick_start(struct rq *rq, u64 delay) +{ + __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static inline void init_hrtick(void) +{ +} +#endif /* CONFIG_SMP */ + +static void init_rq_hrtick(struct rq *rq) +{ +#ifdef CONFIG_SMP + rq->hrtick_csd_pending = 0; + + rq->hrtick_csd.flags = 0; + rq->hrtick_csd.func = __hrtick_start; + rq->hrtick_csd.info = rq; +#endif + + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; +} +#else /* CONFIG_SCHED_HRTICK */ +static inline void hrtick_clear(struct rq *rq) +{ +} + +static inline void init_rq_hrtick(struct rq *rq) +{ +} + +static inline void init_hrtick(void) +{ +} +#endif /* CONFIG_SCHED_HRTICK */ + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +#ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + +void resched_task(struct task_struct *p) +{ + int cpu; + + assert_raw_spin_locked(&task_rq(p)->lock); + + if (test_tsk_need_resched(p)) + return; + + set_tsk_need_resched(p); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); +} + +void resched_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!raw_spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +#ifdef CONFIG_NO_HZ +/* + * In the semi idle case, use the nearest busy cpu for migrating timers + * from an idle cpu. This is good for power-savings. + * + * We don't do similar optimization for completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be uptodate wrt jiffies etc). + */ +int get_nohz_timer_target(void) +{ + int cpu = smp_processor_id(); + int i; + struct sched_domain *sd; + + rcu_read_lock(); + for_each_domain(cpu, sd) { + for_each_cpu(i, sched_domain_span(sd)) { + if (!idle_cpu(i)) { + cpu = i; + goto unlock; + } + } + } +unlock: + rcu_read_unlock(); + return cpu; +} +/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. + */ +void wake_up_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (cpu == smp_processor_id()) + return; + + /* + * This is safe, as this function is called with the timer + * wheel base lock of (cpu) held. When the CPU is on the way + * to idle and has not yet set rq->curr to idle then it will + * be serialized on the timer wheel base lock and take the new + * timer into account automatically. + */ + if (rq->curr != rq->idle) + return; + + /* + * We can set TIF_RESCHED on the idle task of the other CPU + * lockless. The worst case is that the other CPU runs the + * idle task through an additional NOOP schedule() + */ + set_tsk_need_resched(rq->idle); + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(rq->idle)) + smp_send_reschedule(cpu); +} + +static inline bool got_nohz_idle_kick(void) +{ + return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; +} + +#else /* CONFIG_NO_HZ */ + +static inline bool got_nohz_idle_kick(void) +{ + return false; +} + +#endif /* CONFIG_NO_HZ */ + +void sched_avg_update(struct rq *rq) +{ + s64 period = sched_avg_period(); + + while ((s64)(rq->clock - rq->age_stamp) > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (rq->age_stamp)); + rq->age_stamp += period; + rq->rt_avg /= 2; + } +} + +#else /* !CONFIG_SMP */ +void resched_task(struct task_struct *p) +{ + assert_raw_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); +} +#endif /* CONFIG_SMP */ + +#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ + (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) +/* + * Iterate task_group tree rooted at *from, calling @down when first entering a + * node and @up when leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. + */ +int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data) +{ + struct task_group *parent, *child; + int ret; + + parent = from; + +down: + ret = (*down)(parent, data); + if (ret) + goto out; + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + ret = (*up)(parent, data); + if (ret || parent == from) + goto out; + + child = parent; + parent = parent->parent; + if (parent) + goto up; +out: + return ret; +} + +int tg_nop(struct task_group *tg, void *data) +{ + return 0; +} +#endif + +void update_cpu_load(struct rq *this_rq); + +static void set_load_weight(struct task_struct *p) +{ + int prio = p->static_prio - MAX_RT_PRIO; + struct load_weight *load = &p->se.load; + + /* + * SCHED_IDLE tasks get minimal weight: + */ + if (p->policy == SCHED_IDLE) { + load->weight = scale_load(WEIGHT_IDLEPRIO); + load->inv_weight = WMULT_IDLEPRIO; + return; + } + + load->weight = scale_load(prio_to_weight[prio]); + load->inv_weight = prio_to_wmult[prio]; +} + +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +{ + update_rq_clock(rq); + sched_info_queued(p); + p->sched_class->enqueue_task(rq, p, flags); +} + +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +{ + update_rq_clock(rq); + sched_info_dequeued(p); + p->sched_class->dequeue_task(rq, p, flags); +} + +/* + * activate_task - move a task to the runqueue. + */ +void activate_task(struct rq *rq, struct task_struct *p, int flags) +{ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; + + enqueue_task(rq, p, flags); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +void deactivate_task(struct rq *rq, struct task_struct *p, int flags) +{ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible++; + + dequeue_task(rq, p, flags); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. + */ +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +static DEFINE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +void account_system_vtime(struct task_struct *curr) +{ + unsigned long flags; + s64 delta; + int cpu; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + __this_cpu_add(cpu_hardirq_time, delta); + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + __this_cpu_add(cpu_softirq_time, delta); + + irq_time_write_end(); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(account_system_vtime); + +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) +{ + if (unlikely(steal > NSEC_PER_SEC)) + return div_u64(steal, TICK_NSEC); + + return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ +/* + * In theory, the compile should just see 0 here, and optimize out the call + * to sched_rt_avg_update. But I don't trust it... + */ +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + s64 steal = 0, irq_delta = 0; +#endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}irq region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight miss-attribution of {soft,}irq + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. + */ + if (irq_delta > delta) + irq_delta = delta; + + rq->prev_irq_time += irq_delta; + delta -= irq_delta; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + if (static_branch((¶virt_steal_rq_enabled))) { + u64 st; + + steal = paravirt_steal_clock(cpu_of(rq)); + steal -= rq->prev_steal_time_rq; + + if (unlikely(steal > delta)) + steal = delta; + + st = steal_ticks(steal); + steal = st * TICK_NSEC; + + rq->prev_steal_time_rq += steal; + + delta -= steal; + } +#endif + + rq->clock_task += delta; + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) + sched_rt_avg_update(rq, irq_delta + steal); +#endif +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int irqtime_account_hi_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#define sched_clock_irqtime (0) + +#endif + +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. + */ + old_stop->sched_class = &rt_sched_class; + } +} + +/* + * __normal_prio - return the priority that is based on the static prio + */ +static inline int __normal_prio(struct task_struct *p) +{ + return p->static_prio; +} + +/* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(struct task_struct *p) +{ + int prio; + + if (task_has_rt_policy(p)) + prio = MAX_RT_PRIO-1 - p->rt_priority; + else + prio = __normal_prio(p); + return prio; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(struct task_struct *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. + */ +inline int task_curr(const struct task_struct *p) +{ + return cpu_curr(task_cpu(p)) == p; +} + +static inline void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio) +{ + if (prev_class != p->sched_class) { + if (prev_class->switched_from) + prev_class->switched_from(rq, p); + p->sched_class->switched_to(rq, p); + } else if (oldprio != p->prio) + p->sched_class->prio_changed(rq, p, oldprio); +} + +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +{ + const struct sched_class *class; + + if (p->sched_class == rq->curr->sched_class) { + rq->curr->sched_class->check_preempt_curr(rq, p, flags); + } else { + for_each_class(class) { + if (class == rq->curr->sched_class) + break; + if (class == p->sched_class) { + resched_task(rq->curr); + break; + } + } + } + + /* + * A queue event has occurred, and we're going to schedule. In + * this case, we can save a useless back to back clock update. + */ + if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) + rq->skip_clock_update = 1; +} + +#ifdef CONFIG_SMP +void set_task_cpu(struct task_struct *p, unsigned int new_cpu) +{ +#ifdef CONFIG_SCHED_DEBUG + /* + * We should never call set_task_cpu() on a blocked task, + * ttwu() will sort out the placement. + */ + WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && + !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + +#ifdef CONFIG_LOCKDEP + /* + * The caller should hold either p->pi_lock or rq->lock, when changing + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. + * + * sched_move_task() holds both and thus holding either pins the cgroup, + * see set_task_rq(). + * + * Furthermore, all task_rq users should acquire both locks, see + * task_rq_lock(). + */ + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock))); +#endif +#endif + + trace_sched_migrate_task(p, new_cpu); + + if (task_cpu(p) != new_cpu) { + p->se.nr_migrations++; + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); + } + + __set_task_cpu(p, new_cpu); +} + +struct migration_arg { + struct task_struct *task; + int dest_cpu; +}; + +static int migration_cpu_stop(void *data); + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +unsigned long wait_task_inactive(struct task_struct *p, long match_state) +{ + unsigned long flags; + int running, on_rq; + unsigned long ncsw; + struct rq *rq; + + for (;;) { + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! + */ + rq = task_rq(p); + + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_running()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_running(rq, p)) { + if (match_state && unlikely(p->state != match_state)) + return 0; + cpu_relax(); + } + + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &flags); + trace_sched_wait_task(p); + running = task_running(rq, p); + on_rq = p->on_rq; + ncsw = 0; + if (!match_state || p->state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + task_rq_unlock(rq, p, &flags); + + /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } + + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it was still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(on_rq)) { + ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL); + continue; + } + + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! + */ + break; + } + + return ncsw; +} + +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + * + * NOTE: this function doesn't have to take the runqueue lock, + * because all it wants to ensure is that the remote task enters + * the kernel. If the IPI races and the task has been migrated + * to another CPU then no harm is done and the purpose has been + * achieved as well. + */ +void kick_process(struct task_struct *p) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kick_process); +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_SMP +/* + * ->cpus_allowed is protected by both rq->lock and p->pi_lock + */ +static int select_fallback_rq(int cpu, struct task_struct *p) +{ + int dest_cpu; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) + if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + return dest_cpu; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); + if (dest_cpu < nr_cpu_ids) + return dest_cpu; + + /* No more Mr. Nice Guy. */ + dest_cp |