diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2011-11-15 17:14:39 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-11-17 12:20:22 +0100 |
commit | 391e43da797a96aeb65410281891f6d0b0e9611c (patch) | |
tree | 0ce6784525a5a8f75b377170cf1a7d60abccea29 /kernel/sched_fair.c | |
parent | 029632fbb7b7c9d85063cc9eb470de6c54873df3 (diff) |
sched: Move all scheduler bits into kernel/sched/
There's too many sched*.[ch] files in kernel/, give them their own
directory.
(No code changed, other than Makefile glue added.)
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r-- | kernel/sched_fair.c | 5601 |
1 files changed, 0 insertions, 5601 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c deleted file mode 100644 index cd3b64219d9..00000000000 --- a/kernel/sched_fair.c +++ /dev/null @@ -1,5601 +0,0 @@ -/* - * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) - * - * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * - * Interactivity improvements by Mike Galbraith - * (C) 2007 Mike Galbraith <efault@gmx.de> - * - * Various enhancements by Dmitry Adamushko. - * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com> - * - * Group scheduling enhancements by Srivatsa Vaddagiri - * Copyright IBM Corporation, 2007 - * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> - * - * Scaled math optimizations by Thomas Gleixner - * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> - * - * Adaptive scheduling granularity, math enhancements by Peter Zijlstra - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> - */ - -#include <linux/latencytop.h> -#include <linux/sched.h> -#include <linux/cpumask.h> -#include <linux/slab.h> -#include <linux/profile.h> -#include <linux/interrupt.h> - -#include <trace/events/sched.h> - -#include "sched.h" - -/* - * Targeted preemption latency for CPU-bound tasks: - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) - * - * NOTE: this latency value is not the same as the concept of - * 'timeslice length' - timeslices in CFS are of variable length - * and have no persistent notion like in traditional, time-slice - * based scheduling concepts. - * - * (to see the precise effective timeslice length of your workload, - * run vmstat and monitor the context-switches (cs) field) - */ -unsigned int sysctl_sched_latency = 6000000ULL; -unsigned int normalized_sysctl_sched_latency = 6000000ULL; - -/* - * The initial- and re-scaling of tunables is configurable - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) - * - * Options are: - * SCHED_TUNABLESCALING_NONE - unscaled, always *1 - * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) - * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus - */ -enum sched_tunable_scaling sysctl_sched_tunable_scaling - = SCHED_TUNABLESCALING_LOG; - -/* - * Minimal preemption granularity for CPU-bound tasks: - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -unsigned int sysctl_sched_min_granularity = 750000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; - -/* - * is kept at sysctl_sched_latency / sysctl_sched_min_granularity - */ -static unsigned int sched_nr_latency = 8; - -/* - * After fork, child runs first. If set to 0 (default) then - * parent will (try to) run first. - */ -unsigned int sysctl_sched_child_runs_first __read_mostly; - -/* - * SCHED_OTHER wake-up granularity. - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. - */ -unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; - -const_debug unsigned int sysctl_sched_migration_cost = 500000UL; - -/* - * The exponential sliding window over which load is averaged for shares - * distribution. - * (default: 10msec) - */ -unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; - -#ifdef CONFIG_CFS_BANDWIDTH -/* - * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool - * each time a cfs_rq requests quota. - * - * Note: in the case that the slice exceeds the runtime remaining (either due - * to consumption or the quota being specified to be smaller than the slice) - * we will always only issue the remaining available time. - * - * default: 5 msec, units: microseconds - */ -unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; -#endif - -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. - * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static int get_update_sysctl_factor(void) -{ - unsigned int cpus = min_t(int, num_online_cpus(), 8); - unsigned int factor; - - switch (sysctl_sched_tunable_scaling) { - case SCHED_TUNABLESCALING_NONE: - factor = 1; - break; - case SCHED_TUNABLESCALING_LINEAR: - factor = cpus; - break; - case SCHED_TUNABLESCALING_LOG: - default: - factor = 1 + ilog2(cpus); - break; - } - - return factor; -} - -static void update_sysctl(void) -{ - unsigned int factor = get_update_sysctl_factor(); - -#define SET_SYSCTL(name) \ - (sysctl_##name = (factor) * normalized_sysctl_##name) - SET_SYSCTL(sched_min_granularity); - SET_SYSCTL(sched_latency); - SET_SYSCTL(sched_wakeup_granularity); -#undef SET_SYSCTL -} - -void sched_init_granularity(void) -{ - update_sysctl(); -} - -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - -#define WMULT_SHIFT 32 - -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) - -/* - * delta *= weight / lw - */ -static unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) -{ - u64 tmp; - - /* - * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched - * entities since MIN_SHARES = 2. Treat weight as 1 if less than - * 2^SCHED_LOAD_RESOLUTION. - */ - if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) - tmp = (u64)delta_exec * scale_load_down(weight); - else - tmp = (u64)delta_exec; - - if (!lw->inv_weight) { - unsigned long w = scale_load_down(lw->weight); - - if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) - lw->inv_weight = 1; - else if (unlikely(!w)) - lw->inv_weight = WMULT_CONST; - else - lw->inv_weight = WMULT_CONST / w; - } - - /* - * Check whether we'd overflow the 64-bit multiplication: - */ - if (unlikely(tmp > WMULT_CONST)) - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, - WMULT_SHIFT/2); - else - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); - - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); -} - - -const struct sched_class fair_sched_class; - -/************************************************************** - * CFS operations on generic schedulable entities: - */ - -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* cpu runqueue to which this cfs_rq is attached */ -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rq; -} - -/* An entity is a task if it doesn't "own" a runqueue */ -#define entity_is_task(se) (!se->my_q) - -static inline struct task_struct *task_of(struct sched_entity *se) -{ -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(!entity_is_task(se)); -#endif - return container_of(se, struct task_struct, se); -} - -/* Walk up scheduling entities hierarchy */ -#define for_each_sched_entity(se) \ - for (; se; se = se->parent) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) -{ - if (!cfs_rq->on_list) { - /* - * Ensure we either appear before our parent (if already - * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. - */ - if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { - list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } - - cfs_rq->on_list = 1; - } -} - -static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->on_list) { - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - cfs_rq->on_list = 0; - } -} - -/* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) - -/* Do the two (enqueued) entities belong to the same group ? */ -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - if (se->cfs_rq == pse->cfs_rq) - return 1; - - return 0; -} - -static inline struct sched_entity *parent_entity(struct sched_entity *se) -{ - return se->parent; -} - -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - -static void -find_matching_se(struct sched_entity **se, struct sched_entity **pse) -{ - int se_depth, pse_depth; - - /* - * preemption test can be made between sibling entities who are in the - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of - * both tasks until we find their ancestors who are siblings of common - * parent. - */ - - /* First walk up until both entities are at same depth */ - se_depth = depth_se(*se); - pse_depth = depth_se(*pse); - - while (se_depth > pse_depth) { - se_depth--; - *se = parent_entity(*se); - } - - while (pse_depth > se_depth) { - pse_depth--; - *pse = parent_entity(*pse); - } - - while (!is_same_group(*se, *pse)) { - *se = parent_entity(*se); - *pse = parent_entity(*pse); - } -} - -#else /* !CONFIG_FAIR_GROUP_SCHED */ - -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return container_of(cfs_rq, struct rq, cfs); -} - -#define entity_is_task(se) 1 - -#define for_each_sched_entity(se) \ - for (; se; se = NULL) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) -{ -} - -static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) -{ -} - -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) - -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - return 1; -} - -static inline struct sched_entity *parent_entity(struct sched_entity *se) -{ - return NULL; -} - -static inline void -find_matching_se(struct sched_entity **se, struct sched_entity **pse) -{ -} - -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec); - -/************************************************************** - * Scheduling class tree data structure manipulation methods: - */ - -static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) -{ - s64 delta = (s64)(vruntime - min_vruntime); - if (delta > 0) - min_vruntime = vruntime; - - return min_vruntime; -} - -static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) -{ - s64 delta = (s64)(vruntime - min_vruntime); - if (delta < 0) - min_vruntime = vruntime; - - return min_vruntime; -} - -static inline int entity_before(struct sched_entity *a, - struct sched_entity *b) -{ - return (s64)(a->vruntime - b->vruntime) < 0; -} - -static void update_min_vruntime(struct cfs_rq *cfs_rq) -{ - u64 vruntime = cfs_rq->min_vruntime; - - if (cfs_rq->curr) - vruntime = cfs_rq->curr->vruntime; - - if (cfs_rq->rb_leftmost) { - struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, - struct sched_entity, - run_node); - - if (!cfs_rq->curr) - vruntime = se->vruntime; - else - vruntime = min_vruntime(vruntime, se->vruntime); - } - - cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif -} - -/* - * Enqueue an entity into the rb-tree: - */ -static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - int leftmost = 1; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. - */ - if (entity_before(se, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = 0; - } - } - - /* - * Maintain a cache of leftmost tree entries (it is frequently - * used): - */ - if (leftmost) - cfs_rq->rb_leftmost = &se->run_node; - - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); -} - -static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (cfs_rq->rb_leftmost == &se->run_node) { - struct rb_node *next_node; - - next_node = rb_next(&se->run_node); - cfs_rq->rb_leftmost = next_node; - } - - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); -} - -struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) -{ - struct rb_node *left = cfs_rq->rb_leftmost; - - if (!left) - return NULL; - - return rb_entry(left, struct sched_entity, run_node); -} - -static struct sched_entity *__pick_next_entity(struct sched_entity *se) -{ - struct rb_node *next = rb_next(&se->run_node); - - if (!next) - return NULL; - - return rb_entry(next, struct sched_entity, run_node); -} - -#ifdef CONFIG_SCHED_DEBUG -struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) -{ - struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); - - if (!last) - return NULL; - - return rb_entry(last, struct sched_entity, run_node); -} - -/************************************************************** - * Scheduling class statistics methods: - */ - -int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int factor = get_update_sysctl_factor(); - - if (ret || !write) - return ret; - - sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, - sysctl_sched_min_granularity); - -#define WRT_SYSCTL(name) \ - (normalized_sysctl_##name = sysctl_##name / (factor)) - WRT_SYSCTL(sched_min_granularity); - WRT_SYSCTL(sched_latency); - WRT_SYSCTL(sched_wakeup_granularity); -#undef WRT_SYSCTL - - return 0; -} -#endif - -/* - * delta /= w - */ -static inline unsigned long -calc_delta_fair(unsigned long delta, struct sched_entity *se) -{ - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); - - return delta; -} - -/* - * The idea is to set a period in which each task runs once. - * - * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch - * this period because otherwise the slices get too small. - * - * p = (nr <= nl) ? l : l*nr/nl - */ -static u64 __sched_period(unsigned long nr_running) -{ - u64 period = sysctl_sched_latency; - unsigned long nr_latency = sched_nr_latency; - - if (unlikely(nr_running > nr_latency)) { - period = sysctl_sched_min_granularity; - period *= nr_running; - } - - return period; -} - -/* - * We calculate the wall-time slice from the period by taking a part - * proportional to the weight. - * - * s = p*P[w/rw] - */ -static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); - - for_each_sched_entity(se) { - struct load_weight *load; - struct load_weight lw; - - cfs_rq = cfs_rq_of(se); - load = &cfs_rq->load; - - if (unlikely(!se->on_rq)) { - lw = cfs_rq->load; - - update_load_add(&lw, se->load.weight); - load = &lw; - } - slice = calc_delta_mine(slice, se->load.weight, load); - } - return slice; -} - -/* - * We calculate the vruntime slice of a to be inserted task - * - * vs = s/w - */ -static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return calc_delta_fair(sched_slice(cfs_rq, se), se); -} - -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq); - -/* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. - */ -static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, - unsigned long delta_exec) -{ - unsigned long delta_exec_weighted; - - schedstat_set(curr->statistics.exec_max, - max((u64)delta_exec, curr->statistics.exec_max)); - - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = calc_delta_fair(delta_exec, curr); - - curr->vruntime += delta_exec_weighted; - update_min_vruntime(cfs_rq); - -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED - cfs_rq->load_unacc_exec_time += delta_exec; -#endif -} - -static void update_curr(struct cfs_rq *cfs_rq) -{ - struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock_task; - unsigned long delta_exec; - - if (unlikely(!curr)) - return; - - /* - * Get the amount of time the current task was running - * since the last time we changed load (this cannot - * overflow on 32 bits): - */ - delta_exec = (unsigned long)(now - curr->exec_start); - if (!delta_exec) - return; - - __update_curr(cfs_rq, curr, delta_exec); - curr->exec_start = now; - - if (entity_is_task(curr)) { - struct task_struct *curtask = task_of(curr); - - trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cpuacct_charge(curtask, delta_exec); - account_group_exec_runtime(curtask, delta_exec); - } - - account_cfs_rq_runtime(cfs_rq, delta_exec); -} - -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); -} - -/* - * Task is being enqueued - update stats: - */ -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* - * Are we enqueueing a waiting task? (for current tasks - * a dequeue/enqueue event is a NOP) - */ - if (se != cfs_rq->curr) - update_stats_wait_start(cfs_rq, se); -} - -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_of(cfs_rq)->clock - se->statistics.wait_start)); - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_of(cfs_rq)->clock - se->statistics.wait_start); -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - trace_sched_stat_wait(task_of(se), - rq_of(cfs_rq)->clock - se->statistics.wait_start); - } -#endif - schedstat_set(se->statistics.wait_start, 0); -} - -static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* - * Mark the end of the wait period if dequeueing a - * waiting task: - */ - if (se != cfs_rq->curr) - update_stats_wait_end(cfs_rq, se); -} - -/* - * We are picking a new current task - update its stats: - */ -static inline void -update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* - * We are starting a new run period: - */ - se->exec_start = rq_of(cfs_rq)->clock_task; -} - -/************************************************** - * Scheduling class queueing methods: - */ - -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ - cfs_rq->task_weight += weight; -} -#else -static inline void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ -} -#endif - -static void -account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - update_load_add(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_add(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, se->load.weight); - list_add(&se->group_node, &cfs_rq->tasks); - } - cfs_rq->nr_running++; -} - -static void -account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - update_load_sub(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, -se->load.weight); - list_del_init(&se->group_node); - } - cfs_rq->nr_running--; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* we need this in update_cfs_load and load-balance functions below */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -# ifdef CONFIG_SMP -static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, - int global_update) -{ - struct task_group *tg = cfs_rq->tg; - long load_avg; - - load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); - load_avg -= cfs_rq->load_contribution; - - if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { - atomic_add(load_avg, &tg->load_weight); - cfs_rq->load_contribution += load_avg; - } -} - -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ - u64 period = sysctl_sched_shares_window; - u64 now, delta; - unsigned long load = cfs_rq->load.weight; - - if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) - return; - - now = rq_of(cfs_rq)->clock_task; - delta = now - cfs_rq->load_stamp; - - /* truncate load history at 4 idle periods */ - if (cfs_rq->load_stamp > cfs_rq->load_last && - now - cfs_rq->load_last > 4 * period) { - cfs_rq->load_period = 0; - cfs_rq->load_avg = 0; - delta = period - 1; - } - - cfs_rq->load_stamp = now; - cfs_rq->load_unacc_exec_time = 0; - cfs_rq->load_period += delta; - if (load) { - cfs_rq->load_last = now; - cfs_rq->load_avg += delta * load; - } - - /* consider updating load contribution on each fold or truncate */ - if (global_update || cfs_rq->load_period > period - || !cfs_rq->load_period) - update_cfs_rq_load_contribution(cfs_rq, global_update); - - while (cfs_rq->load_period > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (cfs_rq->load_period)); - cfs_rq->load_period /= 2; - cfs_rq->load_avg /= 2; - } - - if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) - list_del_leaf_cfs_rq(cfs_rq); -} - -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) -{ - long tg_weight; - - /* - * Use this CPU's actual weight instead of the last load_contribution - * to gain a more accurate current total weight. See - * update_cfs_rq_load_contribution(). - */ - tg_weight = atomic_read(&tg->load_weight); - tg_weight -= cfs_rq->load_contribution; - tg_weight += cfs_rq->load.weight; - - return tg_weight; -} - -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ - long tg_weight, load, shares; - - tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; - - shares = (tg->shares * load); - if (tg_weight) - shares /= tg_weight; - - if (shares < MIN_SHARES) - shares = MIN_SHARES; - if (shares > tg->shares) - shares = tg->shares; - - return shares; -} - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); - } -} -# else /* CONFIG_SMP */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} - -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ - return tg->shares; -} - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -} -# endif /* CONFIG_SMP */ -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight) -{ - if (se->on_rq) { - /* commit outstanding execution time */ - if (cfs_rq->curr == se) - update_curr(cfs_rq); - account_entity_dequeue(cfs_rq, se); - } - - update_load_set(&se->load, weight); - - if (se->on_rq) - account_entity_enqueue(cfs_rq, se); -} - -static void update_cfs_shares(struct cfs_rq *cfs_rq) -{ - struct task_group *tg; - struct sched_entity *se; - long shares; - - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) - return; -#ifndef CONFIG_SMP - if (likely(se->load.weight == tg->shares)) - return; -#endif - shares = calc_cfs_shares(cfs_rq, tg); - - reweight_entity(cfs_rq_of(se), se, shares); -} -#else /* CONFIG_FAIR_GROUP_SCHED */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} - -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) -{ -} - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -} -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHEDSTATS - struct task_struct *tsk = NULL; - - if (entity_is_task(se)) - tsk = task_of(se); - - if (se->statistics.sleep_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.sleep_max)) - se->statistics.sleep_max = delta; - - se->statistics.sleep_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (se->statistics.block_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.block_max)) - se->statistics.block_max = delta; - - se->statistics.block_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - if (tsk->in_iowait) { - se->statistics.iowait_sum += delta; - se->statistics.iowait_count++; - trace_sched_stat_iowait(tsk, delta); - } - - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), - delta >> 20); - } - account_scheduler_latency(tsk, delta >> 10, 0); - } - } -#endif -} - -static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHED_DEBUG - s64 d = se->vruntime - cfs_rq->min_vruntime; - - if (d < 0) - d = -d; - - if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq, nr_spread_over); -#endif -} - -static void -place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) -{ - u64 vruntime = cfs_rq->min_vruntime; - - /* - * The 'current' period is already promised to the current tasks, - * however the extra weight of the new task will slow them down a - * little, place the new task so that it fits in the slot that - * stays open at the end. - */ - if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice(cfs_rq, se); - - /* sleeps up to a single latency don't count. */ - if (!initial) { - unsigned long thresh = sysctl_sched_latency; - - /* - * Halve their sleep time's effect, to allow - * for a gentler effect of sleepers: - */ - if (sched_feat(GENTLE_FAIR_SLEEPERS)) - thresh >>= 1; - - vruntime -= thresh; - } - - /* ensure we never gain time by being placed backwards. */ - vruntime = max_vruntime(se->vruntime, vruntime); - - se->vruntime = vruntime; -} - -static void check_enqueue_throttle(struct cfs_rq *cfs_rq); - -static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ - /* - * Update the normalized vruntime before updating min_vruntime - * through callig update_curr(). - */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) - se->vruntime += cfs_rq->min_vruntime; - - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - update_cfs_load(cfs_rq, 0); - account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); - - if (flags & ENQUEUE_WAKEUP) { - place_entity(cfs_rq, se, 0); - enqueue_sleeper(cfs_rq, se); - } - - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); - if (se != cfs_rq->curr) - __enqueue_entity(cfs_rq, se); - se->on_rq = 1; - - if (cfs_rq->nr_running == 1) { - list_add_leaf_cfs_rq(cfs_rq); - check_enqueue_throttle(cfs_rq); - } -} - -static void __clear_buddies_last(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last == se) - cfs_rq->last = NULL; - else - break; - } -} - -static void __clear_buddies_next(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->next == se) - cfs_rq->next = NULL; - else - break; - } -} - -static void __clear_buddies_skip(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip == se) - cfs_rq->skip = NULL; - else - break; - } -} - -static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (cfs_rq->last == se) - __clear_buddies_last(se); - - if (cfs_rq->next == se) - __clear_buddies_next(se); - - if (cfs_rq->skip == se) - __clear_buddies_skip(se); -} - -static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); - -static void -dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - - update_stats_dequeue(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_of(cfs_rq)->clock; - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_of(cfs_rq)->clock; - } -#endif - } - - clear_buddies(cfs_rq, se); - - if (se != cfs_rq->curr) - __dequeue_entity(cfs_rq, se); - se->on_rq = 0; - update_cfs_load(cfs_rq, 0); - account_entity_dequeue(cfs_rq, se); - - /* - * Normalize the entity after updating the min_vruntime because the - * update can refer to the ->curr item and we need to reflect this - * movement in our normalized positi |