diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2011-11-15 17:14:39 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-11-17 12:20:22 +0100 |
commit | 391e43da797a96aeb65410281891f6d0b0e9611c (patch) | |
tree | 0ce6784525a5a8f75b377170cf1a7d60abccea29 /kernel/sched.c | |
parent | 029632fbb7b7c9d85063cc9eb470de6c54873df3 (diff) |
sched: Move all scheduler bits into kernel/sched/
There's too many sched*.[ch] files in kernel/, give them their own
directory.
(No code changed, other than Makefile glue added.)
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 8101 |
1 files changed, 0 insertions, 8101 deletions
diff --git a/kernel/sched.c b/kernel/sched.c deleted file mode 100644 index 2ffcceed886..00000000000 --- a/kernel/sched.c +++ /dev/null @@ -1,8101 +0,0 @@ -/* - * kernel/sched.c - * - * Kernel scheduler and related syscalls - * - * Copyright (C) 1991-2002 Linus Torvalds - * - * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and - * make semaphores SMP safe - * 1998-11-19 Implemented schedule_timeout() and related stuff - * by Andrea Arcangeli - * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: - * hybrid priority-list and round-robin design with - * an array-switch method of distributing timeslices - * and per-CPU runqueues. Cleanups and useful suggestions - * by Davide Libenzi, preemptible kernel bits by Robert Love. - * 2003-09-03 Interactivity tuning by Con Kolivas. - * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-04-15 Work begun on replacing all interactivity tuning with a - * fair scheduling design by Con Kolivas. - * 2007-05-05 Load balancing (smp-nice) and other improvements - * by Peter Williams - * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith - * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri - * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, - * Thomas Gleixner, Mike Kravetz - */ - -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/nmi.h> -#include <linux/init.h> -#include <linux/uaccess.h> -#include <linux/highmem.h> -#include <asm/mmu_context.h> -#include <linux/interrupt.h> -#include <linux/capability.h> -#include <linux/completion.h> -#include <linux/kernel_stat.h> -#include <linux/debug_locks.h> -#include <linux/perf_event.h> -#include <linux/security.h> -#include <linux/notifier.h> -#include <linux/profile.h> -#include <linux/freezer.h> -#include <linux/vmalloc.h> -#include <linux/blkdev.h> -#include <linux/delay.h> -#include <linux/pid_namespace.h> -#include <linux/smp.h> -#include <linux/threads.h> -#include <linux/timer.h> 
-#include <linux/rcupdate.h> -#include <linux/cpu.h> -#include <linux/cpuset.h> -#include <linux/percpu.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/sysctl.h> -#include <linux/syscalls.h> -#include <linux/times.h> -#include <linux/tsacct_kern.h> -#include <linux/kprobes.h> -#include <linux/delayacct.h> -#include <linux/unistd.h> -#include <linux/pagemap.h> -#include <linux/hrtimer.h> -#include <linux/tick.h> -#include <linux/debugfs.h> -#include <linux/ctype.h> -#include <linux/ftrace.h> -#include <linux/slab.h> -#include <linux/init_task.h> - -#include <asm/tlb.h> -#include <asm/irq_regs.h> -#ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> -#endif - -#include "sched.h" -#include "workqueue_sched.h" - -#define CREATE_TRACE_POINTS -#include <trace/events/sched.h> - -void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) -{ - unsigned long delta; - ktime_t soft, hard, now; - - for (;;) { - if (hrtimer_active(period_timer)) - break; - - now = hrtimer_cb_get_time(period_timer); - hrtimer_forward(period_timer, now, period); - - soft = hrtimer_get_softexpires(period_timer); - hard = hrtimer_get_expires(period_timer); - delta = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(period_timer, soft, delta, - HRTIMER_MODE_ABS_PINNED, 0); - } -} - -DEFINE_MUTEX(sched_domains_mutex); -DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); - -static void update_rq_clock_task(struct rq *rq, s64 delta); - -void update_rq_clock(struct rq *rq) -{ - s64 delta; - - if (rq->skip_clock_update > 0) - return; - - delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; - rq->clock += delta; - update_rq_clock_task(rq, delta); -} - -/* - * Debugging: various feature bits - */ - -#define SCHED_FEAT(name, enabled) \ - (1UL << __SCHED_FEAT_##name) * enabled | - -const_debug unsigned int sysctl_sched_features = -#include "sched_features.h" - 0; - -#undef SCHED_FEAT - -#ifdef CONFIG_SCHED_DEBUG -#define SCHED_FEAT(name, enabled) \ - #name , - 
-static __read_mostly char *sched_feat_names[] = { -#include "sched_features.h" - NULL -}; - -#undef SCHED_FEAT - -static int sched_feat_show(struct seq_file *m, void *v) -{ - int i; - - for (i = 0; sched_feat_names[i]; i++) { - if (!(sysctl_sched_features & (1UL << i))) - seq_puts(m, "NO_"); - seq_printf(m, "%s ", sched_feat_names[i]); - } - seq_puts(m, "\n"); - - return 0; -} - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp; - int neg = 0; - int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); - - if (strncmp(cmp, "NO_", 3) == 0) { - neg = 1; - cmp += 3; - } - - for (i = 0; sched_feat_names[i]; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) - sysctl_sched_features &= ~(1UL << i); - else - sysctl_sched_features |= (1UL << i); - break; - } - } - - if (!sched_feat_names[i]) - return -EINVAL; - - *ppos += cnt; - - return cnt; -} - -static int sched_feat_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_feat_show, NULL); -} - -static const struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .write = sched_feat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static __init int sched_init_debug(void) -{ - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); - - return 0; -} -late_initcall(sched_init_debug); - -#endif - -/* - * Number of tasks to iterate in a single balance run. - * Limited because this is done with IRQs disabled. - */ -const_debug unsigned int sysctl_sched_nr_migrate = 32; - -/* - * period over which we average the RT time consumption, measured - * in ms. - * - * default: 1s - */ -const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; - -/* - * period over which we measure -rt task cpu usage in us. 
- * default: 1s - */ -unsigned int sysctl_sched_rt_period = 1000000; - -__read_mostly int scheduler_running; - -/* - * part of the period that we allow rt tasks to run in us. - * default: 0.95s - */ -int sysctl_sched_rt_runtime = 950000; - - - -/* - * __task_rq_lock - lock the rq @p resides on. - */ -static inline struct rq *__task_rq_lock(struct task_struct *p) - __acquires(rq->lock) -{ - struct rq *rq; - - lockdep_assert_held(&p->pi_lock); - - for (;;) { - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) - return rq; - raw_spin_unlock(&rq->lock); - } -} - -/* - * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. - */ -static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) - __acquires(p->pi_lock) - __acquires(rq->lock) -{ - struct rq *rq; - - for (;;) { - raw_spin_lock_irqsave(&p->pi_lock, *flags); - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) - return rq; - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); - } -} - -static void __task_rq_unlock(struct rq *rq) - __releases(rq->lock) -{ - raw_spin_unlock(&rq->lock); -} - -static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) - __releases(rq->lock) - __releases(p->pi_lock) -{ - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -} - -/* - * this_rq_lock - lock this runqueue and disable interrupts. - */ -static struct rq *this_rq_lock(void) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_disable(); - rq = this_rq(); - raw_spin_lock(&rq->lock); - - return rq; -} - -#ifdef CONFIG_SCHED_HRTICK -/* - * Use HR-timers to deliver accurate preemption points. - * - * Its all a bit involved since we cannot program an hrt while holding the - * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a - * reschedule event. - * - * When we get rescheduled we reprogram the hrtick_timer outside of the - * rq->lock. 
- */ - -static void hrtick_clear(struct rq *rq) -{ - if (hrtimer_active(&rq->hrtick_timer)) - hrtimer_cancel(&rq->hrtick_timer); -} - -/* - * High-resolution timer tick. - * Runs from hardirq context with interrupts disabled. - */ -static enum hrtimer_restart hrtick(struct hrtimer *timer) -{ - struct rq *rq = container_of(timer, struct rq, hrtick_timer); - - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - - raw_spin_lock(&rq->lock); - update_rq_clock(rq); - rq->curr->sched_class->task_tick(rq, rq->curr, 1); - raw_spin_unlock(&rq->lock); - - return HRTIMER_NORESTART; -} - -#ifdef CONFIG_SMP -/* - * called from hardirq (IPI) context - */ -static void __hrtick_start(void *arg) -{ - struct rq *rq = arg; - - raw_spin_lock(&rq->lock); - hrtimer_restart(&rq->hrtick_timer); - rq->hrtick_csd_pending = 0; - raw_spin_unlock(&rq->lock); -} - -/* - * Called to set the hrtick timer state. - * - * called with rq->lock held and irqs disabled - */ -void hrtick_start(struct rq *rq, u64 delay) -{ - struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = ktime_add_ns(timer->base->get_time(), delay); - - hrtimer_set_expires(timer, time); - - if (rq == this_rq()) { - hrtimer_restart(timer); - } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); - rq->hrtick_csd_pending = 1; - } -} - -static int -hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - hrtick_clear(cpu_rq(cpu)); - return NOTIFY_OK; - } - - return NOTIFY_DONE; -} - -static __init void init_hrtick(void) -{ - hotcpu_notifier(hotplug_hrtick, 0); -} -#else -/* - * Called to set the hrtick timer state. 
- * - * called with rq->lock held and irqs disabled - */ -void hrtick_start(struct rq *rq, u64 delay) -{ - __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, - HRTIMER_MODE_REL_PINNED, 0); -} - -static inline void init_hrtick(void) -{ -} -#endif /* CONFIG_SMP */ - -static void init_rq_hrtick(struct rq *rq) -{ -#ifdef CONFIG_SMP - rq->hrtick_csd_pending = 0; - - rq->hrtick_csd.flags = 0; - rq->hrtick_csd.func = __hrtick_start; - rq->hrtick_csd.info = rq; -#endif - - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rq->hrtick_timer.function = hrtick; -} -#else /* CONFIG_SCHED_HRTICK */ -static inline void hrtick_clear(struct rq *rq) -{ -} - -static inline void init_rq_hrtick(struct rq *rq) -{ -} - -static inline void init_hrtick(void) -{ -} -#endif /* CONFIG_SCHED_HRTICK */ - -/* - * resched_task - mark a task 'to be rescheduled now'. - * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU. - */ -#ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) -#endif - -void resched_task(struct task_struct *p) -{ - int cpu; - - assert_raw_spin_locked(&task_rq(p)->lock); - - if (test_tsk_need_resched(p)) - return; - - set_tsk_need_resched(p); - - cpu = task_cpu(p); - if (cpu == smp_processor_id()) - return; - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) - smp_send_reschedule(cpu); -} - -void resched_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - if (!raw_spin_trylock_irqsave(&rq->lock, flags)) - return; - resched_task(cpu_curr(cpu)); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -#ifdef CONFIG_NO_HZ -/* - * In the semi idle case, use the nearest busy cpu for migrating timers - * from an idle cpu. This is good for power-savings. 
- * - * We don't do similar optimization for completely idle system, as - * selecting an idle cpu will add more delays to the timers than intended - * (as that cpu's timer base may not be uptodate wrt jiffies etc). - */ -int get_nohz_timer_target(void) -{ - int cpu = smp_processor_id(); - int i; - struct sched_domain *sd; - - rcu_read_lock(); - for_each_domain(cpu, sd) { - for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i)) { - cpu = i; - goto unlock; - } - } - } -unlock: - rcu_read_unlock(); - return cpu; -} -/* - * When add_timer_on() enqueues a timer into the timer wheel of an - * idle CPU then this timer might expire before the next timer event - * which is scheduled to wake up that CPU. In case of a completely - * idle system the next event might even be infinite time into the - * future. wake_up_idle_cpu() ensures that the CPU is woken up and - * leaves the inner idle loop so the newly added timer is taken into - * account when the CPU goes back to idle and evaluates the timer - * wheel for the next timer event. - */ -void wake_up_idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (cpu == smp_processor_id()) - return; - - /* - * This is safe, as this function is called with the timer - * wheel base lock of (cpu) held. When the CPU is on the way - * to idle and has not yet set rq->curr to idle then it will - * be serialized on the timer wheel base lock and take the new - * timer into account automatically. - */ - if (rq->curr != rq->idle) - return; - - /* - * We can set TIF_RESCHED on the idle task of the other CPU - * lockless. 
The worst case is that the other CPU runs the - * idle task through an additional NOOP schedule() - */ - set_tsk_need_resched(rq->idle); - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(rq->idle)) - smp_send_reschedule(cpu); -} - -static inline bool got_nohz_idle_kick(void) -{ - return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; -} - -#else /* CONFIG_NO_HZ */ - -static inline bool got_nohz_idle_kick(void) -{ - return false; -} - -#endif /* CONFIG_NO_HZ */ - -void sched_avg_update(struct rq *rq) -{ - s64 period = sched_avg_period(); - - while ((s64)(rq->clock - rq->age_stamp) > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (rq->age_stamp)); - rq->age_stamp += period; - rq->rt_avg /= 2; - } -} - -#else /* !CONFIG_SMP */ -void resched_task(struct task_struct *p) -{ - assert_raw_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); -} -#endif /* CONFIG_SMP */ - -#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ - (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) -/* - * Iterate task_group tree rooted at *from, calling @down when first entering a - * node and @up when leaving it for the final time. - * - * Caller must hold rcu_lock or sufficient equivalent. 
- */ -int walk_tg_tree_from(struct task_group *from, - tg_visitor down, tg_visitor up, void *data) -{ - struct task_group *parent, *child; - int ret; - - parent = from; - -down: - ret = (*down)(parent, data); - if (ret) - goto out; - list_for_each_entry_rcu(child, &parent->children, siblings) { - parent = child; - goto down; - -up: - continue; - } - ret = (*up)(parent, data); - if (ret || parent == from) - goto out; - - child = parent; - parent = parent->parent; - if (parent) - goto up; -out: - return ret; -} - -int tg_nop(struct task_group *tg, void *data) -{ - return 0; -} -#endif - -void update_cpu_load(struct rq *this_rq); - -static void set_load_weight(struct task_struct *p) -{ - int prio = p->static_prio - MAX_RT_PRIO; - struct load_weight *load = &p->se.load; - - /* - * SCHED_IDLE tasks get minimal weight: - */ - if (p->policy == SCHED_IDLE) { - load->weight = scale_load(WEIGHT_IDLEPRIO); - load->inv_weight = WMULT_IDLEPRIO; - return; - } - - load->weight = scale_load(prio_to_weight[prio]); - load->inv_weight = prio_to_wmult[prio]; -} - -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) -{ - update_rq_clock(rq); - sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, flags); -} - -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) -{ - update_rq_clock(rq); - sched_info_dequeued(p); - p->sched_class->dequeue_task(rq, p, flags); -} - -/* - * activate_task - move a task to the runqueue. - */ -void activate_task(struct rq *rq, struct task_struct *p, int flags) -{ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - - enqueue_task(rq, p, flags); -} - -/* - * deactivate_task - remove a task from the runqueue. - */ -void deactivate_task(struct rq *rq, struct task_struct *p, int flags) -{ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - - dequeue_task(rq, p, flags); -} - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - -/* - * There are no locks covering percpu hardirq/softirq time. 
- * They are only modified in account_system_vtime, on corresponding CPU - * with interrupts disabled. So, writes are safe. - * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old - * or new value with a side effect of accounting a slice of irq time to wrong - * task when irq is in progress while we read rq->clock. That is a worthy - * compromise in place of having locks on each irq in account_system_time. - */ -static DEFINE_PER_CPU(u64, cpu_hardirq_time); -static DEFINE_PER_CPU(u64, cpu_softirq_time); - -static DEFINE_PER_CPU(u64, irq_start_time); -static int sched_clock_irqtime; - -void enable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 1; -} - -void disable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 0; -} - -#ifndef CONFIG_64BIT -static DEFINE_PER_CPU(seqcount_t, irq_time_seq); - -static inline void irq_time_write_begin(void) -{ - __this_cpu_inc(irq_time_seq.sequence); - smp_wmb(); -} - -static inline void irq_time_write_end(void) -{ - smp_wmb(); - __this_cpu_inc(irq_time_seq.sequence); -} - -static inline u64 irq_time_read(int cpu) -{ - u64 irq_time; - unsigned seq; - - do { - seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); - irq_time = per_cpu(cpu_softirq_time, cpu) + - per_cpu(cpu_hardirq_time, cpu); - } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); - - return irq_time; -} -#else /* CONFIG_64BIT */ -static inline void irq_time_write_begin(void) -{ -} - -static inline void irq_time_write_end(void) -{ -} - -static inline u64 irq_time_read(int cpu) -{ - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); -} -#endif /* CONFIG_64BIT */ - -/* - * Called before incrementing preempt_count on {soft,}irq_enter - * and before decrementing preempt_count on {soft,}irq_exit. 
- */ -void account_system_vtime(struct task_struct *curr) -{ - unsigned long flags; - s64 delta; - int cpu; - - if (!sched_clock_irqtime) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); - __this_cpu_add(irq_start_time, delta); - - irq_time_write_begin(); - /* - * We do not account for softirq time from ksoftirqd here. - * We want to continue accounting softirq time to ksoftirqd thread - * in that case, so as not to confuse scheduler with a special task - * that do not consume any time, but still wants to run. - */ - if (hardirq_count()) - __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - __this_cpu_add(cpu_softirq_time, delta); - - irq_time_write_end(); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(account_system_vtime); - -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) -{ - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); - - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif - -static void update_rq_clock_task(struct rq *rq, s64 delta) -{ -/* - * In theory, the compile should just see 0 here, and optimize out the call - * to sched_rt_avg_update. But I don't trust it... - */ -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - s64 steal = 0, irq_delta = 0; -#endif -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; - - /* - * Since irq_time is only updated on {soft,}irq_exit, we might run into - * this case when a previous update_rq_clock() happened inside a - * {soft,}irq region. - * - * When this happens, we stop ->clock_task and only update the - * prev_irq_time stamp to account for the part that fit, so that a next - * update will consume the rest. This ensures ->clock_task is - * monotonic. 
- * - * It does however cause some slight miss-attribution of {soft,}irq - * time, a more accurate solution would be to update the irq_time using - * the current rq->clock timestamp, except that would require using - * atomic ops. - */ - if (irq_delta > delta) - irq_delta = delta; - - rq->prev_irq_time += irq_delta; - delta -= irq_delta; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_branch((&paravirt_steal_rq_enabled))) { - u64 st; - - steal = paravirt_steal_clock(cpu_of(rq)); - steal -= rq->prev_steal_time_rq; - - if (unlikely(steal > delta)) - steal = delta; - - st = steal_ticks(steal); - steal = st * TICK_NSEC; - - rq->prev_steal_time_rq += steal; - - delta -= steal; - } -#endif - - rq->clock_task += delta; - -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) - sched_rt_avg_update(rq, irq_delta + steal); -#endif -} - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -static int irqtime_account_hi_update(void) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_hardirq_time); - if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) - ret = 1; - local_irq_restore(flags); - return ret; -} - -static int irqtime_account_si_update(void) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_softirq_time); - if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) - ret = 1; - local_irq_restore(flags); - return ret; -} - -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#define sched_clock_irqtime (0) - -#endif - -void sched_set_stop_task(int cpu, struct task_struct *stop) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; - struct task_struct *old_stop = cpu_rq(cpu)->stop; - - if (stop) { - /* - * Make it 
appear like a SCHED_FIFO task, its something - * userspace knows about and won't get confused about. - * - * Also, it will make PI more or less work without too - * much confusion -- but then, stop work should not - * rely on PI working anyway. - */ - sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); - - stop->sched_class = &stop_sched_class; - } - - cpu_rq(cpu)->stop = stop; - - if (old_stop) { - /* - * Reset it back to a normal scheduling class so that - * it can die in pieces. - */ - old_stop->sched_class = &rt_sched_class; - } -} - -/* - * __normal_prio - return the priority that is based on the static prio - */ -static inline int __normal_prio(struct task_struct *p) -{ - return p->static_prio; -} - -/* - * Calculate the expected normal priority: i.e. priority - * without taking RT-inheritance into account. Might be - * boosted by interactivity modifiers. Changes upon fork, - * setprio syscalls, and whenever the interactivity - * estimator recalculates. - */ -static inline int normal_prio(struct task_struct *p) -{ - int prio; - - if (task_has_rt_policy(p)) - prio = MAX_RT_PRIO-1 - p->rt_priority; - else - prio = __normal_prio(p); - return prio; -} - -/* - * Calculate the current priority, i.e. the priority - * taken into account by the scheduler. This value might - * be boosted by RT tasks, or might be boosted by - * interactivity modifiers. Will be RT if the task got - * RT-boosted. If not then it returns p->normal_prio. - */ -static int effective_prio(struct task_struct *p) -{ - p->normal_prio = normal_prio(p); - /* - * If we are RT tasks or we were boosted to RT priority, - * keep the priority unchanged. Otherwise, update priority - * to the normal priority: - */ - if (!rt_prio(p->prio)) - return p->normal_prio; - return p->prio; -} - -/** - * task_curr - is this task currently executing on a CPU? - * @p: the task in question. 
- */ -inline int task_curr(const struct task_struct *p) -{ - return cpu_curr(task_cpu(p)) == p; -} - -static inline void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) -{ - if (prev_class != p->sched_class) { - if (prev_class->switched_from) - prev_class->switched_from(rq, p); - p->sched_class->switched_to(rq, p); - } else if (oldprio != p->prio) - p->sched_class->prio_changed(rq, p, oldprio); -} - -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) -{ - const struct sched_class *class; - - if (p->sched_class == rq->curr->sched_class) { - rq->curr->sched_class->check_preempt_curr(rq, p, flags); - } else { - for_each_class(class) { - if (class == rq->curr->sched_class) - break; - if (class == p->sched_class) { - resched_task(rq->curr); - break; - } - } - } - - /* - * A queue event has occurred, and we're going to schedule. In - * this case, we can save a useless back to back clock update. - */ - if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) - rq->skip_clock_update = 1; -} - -#ifdef CONFIG_SMP -void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -{ -#ifdef CONFIG_SCHED_DEBUG - /* - * We should never call set_task_cpu() on a blocked task, - * ttwu() will sort out the placement. - */ - WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); - -#ifdef CONFIG_LOCKDEP - /* - * The caller should hold either p->pi_lock or rq->lock, when changing - * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. - * - * sched_move_task() holds both and thus holding either pins the cgroup, - * see set_task_rq(). - * - * Furthermore, all task_rq users should acquire both locks, see - * task_rq_lock(). 
- */ - WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock))); -#endif -#endif - - trace_sched_migrate_task(p, new_cpu); - - if (task_cpu(p) != new_cpu) { - p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); - } - - __set_task_cpu(p, new_cpu); -} - -struct migration_arg { - struct task_struct *task; - int dest_cpu; -}; - -static int migration_cpu_stop(void *data); - -/* - * wait_task_inactive - wait for a thread to unschedule. - * - * If @match_state is nonzero, it's the @p->state value just checked and - * not expected to change. If it changes, i.e. @p might have woken up, - * then return zero. When we succeed in waiting for @p to be off its CPU, - * we return a positive number (its total switch count). If a second call - * a short while later returns the same number, the caller can be sure that - * @p has remained unscheduled the whole time. - * - * The caller must ensure that the task *will* unschedule sometime soon, - * else this function might spin for a *long* time. This function can't - * be called with interrupts off, or it may introduce deadlock with - * smp_call_function() if an IPI is sent by the same process we are - * waiting to become inactive. - */ -unsigned long wait_task_inactive(struct task_struct *p, long match_state) -{ - unsigned long flags; - int running, on_rq; - unsigned long ncsw; - struct rq *rq; - - for (;;) { - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! 
- * But we don't care, since "task_running()" will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_running(rq, p)) { - if (match_state && unlikely(p->state != match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the rq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_rq_lock(p, &flags); - trace_sched_wait_task(p); - running = task_running(rq, p); - on_rq = p->on_rq; - ncsw = 0; - if (!match_state || p->state == match_state) - ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &flags); - - /* - * If it changed from the expected state, bail out now. - */ - if (unlikely(!ncsw)) - break; - - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - continue; - } - - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it was still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(on_rq)) { - ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL); - continue; - } - - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! - */ - break; - } - - return ncsw; -} - -/*** - * kick_process - kick a running thread to enter/exit the kernel - * @p: the to-be-kicked thread - * - * Cause a process which is running on another CPU to enter - * kernel-mode, without any delay. (to get signals handled.) 
- * - * NOTE: this function doesn't have to take the runqueue lock, - * because all it wants to ensure is that the remote task enters - * the kernel. If the IPI races and the task has been migrated - * to another CPU then no harm is done and the purpose has been - * achieved as well. - */ -void kick_process(struct task_struct *p) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p) |