Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 5592 |
1 files changed, 5592 insertions, 0 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
new file mode 100644
index 00000000000..8e42de9105f
--- /dev/null
+++ b/kernel/sched/fair.c
@@ -0,0 +1,5592 @@
/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 * Interactivity improvements by Mike Galbraith
 * (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 * Various enhancements by Dmitry Adamushko.
 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 * Group scheduling enhancements by Srivatsa Vaddagiri
 * Copyright IBM Corporation, 2007
 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 * Scaled math optimizations by Thomas Gleixner
 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */

#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>

#include <trace/events/sched.h>

#include "sched.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
 *
 * Options are:
 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
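/*
 * [Editorial note, not part of the original source] Worked example of the
 * default log-scaling of the tunables above: get_update_sysctl_factor()
 * below clamps ncpus to 8, so on a machine with 8 or more online CPUs
 * factor = 1 + ilog2(8) = 4 and the effective values become
 *
 *   sysctl_sched_latency            = 4 * 6 ms    = 24 ms
 *   sysctl_sched_min_granularity    = 4 * 0.75 ms =  3 ms
 *   sysctl_sched_wakeup_granularity = 4 * 1 ms    =  4 ms
 *
 * which keeps sched_nr_latency at 24 ms / 3 ms = 8.
 */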
/*
 * The exponential sliding window over which load is averaged for shares
 * distribution.
 * (default: 10msec)
 */
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * default: 5 msec, units: microseconds
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/*
 * Shift right and round:
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
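/*
 * [Editorial note, not part of the original source] calc_delta_mine() below
 * computes delta_exec * weight / lw->weight in fixed point: lw->inv_weight
 * caches WMULT_CONST / lw->weight (i.e. 2^32 / weight), which turns the
 * division into a multiply and a shift. Rough example: an entity of weight
 * 1024 on a queue whose total load is 3072 gets
 *
 *   delta * 1024 * (2^32 / 3072) >> 32  ~=  delta / 3
 *
 * i.e. about a third of the wall-clock delta.
 */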
/*
 * delta *= weight / lw
 */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	/*
	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
	 * 2^SCHED_LOAD_RESOLUTION.
	 */
	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
		tmp = (u64)delta_exec * scale_load_down(weight);
	else
		tmp = (u64)delta_exec;

	if (!lw->inv_weight) {
		unsigned long w = scale_load_down(lw->weight);

		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
			lw->inv_weight = 1;
		else if (unlikely(!w))
			lw->inv_weight = WMULT_CONST;
		else
			lw->inv_weight = WMULT_CONST / w;
	}

	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}


const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		/*
		 * Ensure we either appear before our parent (if already
		 * enqueued) or force our parent to appear after us when it is
		 * enqueued. The fact that we always enqueue bottom-up
		 * reduces this to two cases.
		 */
		if (cfs_rq->tg->parent &&
		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		} else {
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		}

		cfs_rq->on_list = 1;
	}
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return 1;

	return 0;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
	int depth = 0;

	for_each_sched_entity(se)
		depth++;

	return depth;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * preemption test can be made between sibling entities who are in the
	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
	 * both tasks until we find their ancestors who are siblings of common
	 * parent.
	 */

	/* First walk up until both entities are at same depth */
	se_depth = depth_se(*se);
	pse_depth = depth_se(*pse);

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

#else	/* !CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)	1

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	return 1;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */

static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				   unsigned long delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta > 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}
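/*
 * [Editorial note, not part of the original source] The comparisons above are
 * done on the signed difference so that u64 wraparound of vruntime is handled
 * correctly. Example: with min_vruntime = ULLONG_MAX - 100 and a vruntime
 * that has wrapped around to 50, (s64)(50 - (ULLONG_MAX - 100)) = +151, so
 * max_vruntime() still picks the logically later value despite the wrap.
 */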
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

int sched_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif

/*
 * delta /= w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}
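/*
 * [Editorial note, not part of the original source] Worked example for
 * __sched_period()/sched_slice() below, using the scaled defaults from the
 * note near the top of the file (24 ms latency, 3 ms minimum granularity,
 * sched_nr_latency = 8):
 *
 *    4 runnable tasks -> period = 24 ms; an equal-weight task gets a
 *                        24 ms * w/rw = 6 ms slice;
 *   16 runnable tasks -> period stretches to 16 * 3 ms = 48 ms, so an
 *                        equal-weight slice never drops below 3 ms.
 */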
/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = calc_delta_mine(slice, se->load.weight, load);
	}
	return slice;
}

/*
 * We calculate the vruntime slice of a to be inserted task
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
static void update_cfs_shares(struct cfs_rq *cfs_rq);

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->statistics.exec_max,
		      max((u64)delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);

	curr->vruntime += delta_exec_weighted;
	update_min_vruntime(cfs_rq);

#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
	cfs_rq->load_unacc_exec_time += delta_exec;
#endif
}

static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock_task;
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)
		return;

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}
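/*
 * [Editorial note, not part of the original source] In __update_curr() above,
 * calc_delta_fair() scales the observed runtime by NICE_0_LOAD/weight before
 * adding it to vruntime. An entity with twice the weight therefore advances
 * its vruntime half as fast and, since the leftmost (smallest-vruntime)
 * entity is picked to run, ends up receiving roughly twice the CPU time.
 */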
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}

/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
			rq_of(cfs_rq)->clock - se->statistics.wait_start));
	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
	if (entity_is_task(se)) {
		trace_sched_stat_wait(task_of(se),
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
	}
#endif
	schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_of(cfs_rq)->clock_task;
}

/**************************************************
 * Scheduling class queueing methods:
 */

#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
static void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
	cfs_rq->task_weight += weight;
}
#else
static inline void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
}
#endif

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
	if (entity_is_task(se)) {
		add_cfs_task_weight(cfs_rq, se->load.weight);
		list_add(&se->group_node, &cfs_rq->tasks);
	}
	cfs_rq->nr_running++;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
	if (entity_is_task(se)) {
		add_cfs_task_weight(cfs_rq, -se->load.weight);
		list_del_init(&se->group_node);
	}
	cfs_rq->nr_running--;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
/* we need this in update_cfs_load and load-balance functions below */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
# ifdef CONFIG_SMP
static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
					    int global_update)
{
	struct task_group *tg = cfs_rq->tg;
	long load_avg;

	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
	load_avg -= cfs_rq->load_contribution;

	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
		atomic_add(load_avg, &tg->load_weight);
		cfs_rq->load_contribution += load_avg;
	}
}
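/*
 * [Editorial note, not part of the original source] update_cfs_load() below
 * maintains a sliding average over sysctl_sched_shares_window (~10 ms):
 * load_avg accumulates delta * load on every update, and whenever
 * load_period exceeds the window both load_period and load_avg are halved,
 * so older windows decay geometrically (1/2, 1/4, 1/8, ...). After four
 * fully idle windows the history is simply truncated to zero.
 */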
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
	u64 period = sysctl_sched_shares_window;
	u64 now, delta;
	unsigned long load = cfs_rq->load.weight;

	if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
		return;

	now = rq_of(cfs_rq)->clock_task;
	delta = now - cfs_rq->load_stamp;

	/* truncate load history at 4 idle periods */
	if (cfs_rq->load_stamp > cfs_rq->load_last &&
	    now - cfs_rq->load_last > 4 * period) {
		cfs_rq->load_period = 0;
		cfs_rq->load_avg = 0;
		delta = period - 1;
	}

	cfs_rq->load_stamp = now;
	cfs_rq->load_unacc_exec_time = 0;
	cfs_rq->load_period += delta;
	if (load) {
		cfs_rq->load_last = now;
		cfs_rq->load_avg += delta * load;
	}

	/* consider updating load contribution on each fold or truncate */
	if (global_update || cfs_rq->load_period > period
	    || !cfs_rq->load_period)
		update_cfs_rq_load_contribution(cfs_rq, global_update);

	while (cfs_rq->load_period > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (cfs_rq->load_period));
		cfs_rq->load_period /= 2;
		cfs_rq->load_avg /= 2;
	}

	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
		list_del_leaf_cfs_rq(cfs_rq);
}

static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
	long tg_weight;

	/*
	 * Use this CPU's actual weight instead of the last load_contribution
	 * to gain a more accurate current total weight. See
	 * update_cfs_rq_load_contribution().
	 */
	tg_weight = atomic_read(&tg->load_weight);
	tg_weight -= cfs_rq->load_contribution;
	tg_weight += cfs_rq->load.weight;

	return tg_weight;
}

static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
	long tg_weight, load, shares;

	tg_weight = calc_tg_weight(tg, cfs_rq);
	load = cfs_rq->load.weight;

	shares = (tg->shares * load);
	if (tg_weight)
		shares /= tg_weight;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg->shares)
		shares = tg->shares;

	return shares;
}

static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
		update_cfs_load(cfs_rq, 0);
		update_cfs_shares(cfs_rq);
	}
}
# else /* CONFIG_SMP */
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
}

static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
	return tg->shares;
}

static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
{
}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
			    unsigned long weight)
{
	if (se->on_rq) {
		/* commit outstanding execution time */
		if (cfs_rq->curr == se)
			update_curr(cfs_rq);
		account_entity_dequeue(cfs_rq, se);
	}

	update_load_set(&se->load, weight);

	if (se->on_rq)
		account_entity_enqueue(cfs_rq, se);
}

static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
	struct task_group *tg;
	struct sched_entity *se;
	long shares;

	tg = cfs_rq->tg;
	se = tg->se[cpu_of(rq_of(cfs_rq))];
	if (!se || throttled_hierarchy(cfs_rq))
		return;
#ifndef CONFIG_SMP
	if (likely(se->load.weight == tg->shares))
		return;
#endif
	shares = calc_cfs_shares(cfs_rq, tg);

	reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
}

static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}

static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
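/*
 * [Editorial note, not part of the original source] Worked example for
 * calc_cfs_shares() above: with tg->shares = 1024, a per-CPU queue load of
 * 2048 and a group-wide weight of 8192, this CPU's group entity gets
 * shares = 1024 * 2048 / 8192 = 256, clamped to [MIN_SHARES, tg->shares].
 * A CPU running all of the group's load would receive the full 1024.
 */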
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	struct task_struct *tsk = NULL;

	if (entity_is_task(se))
		tsk = task_of(se);

	if (se->statistics.sleep_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.sleep_max))
			se->statistics.sleep_max = delta;

		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			account_scheduler_latency(tsk, delta >> 10, 1);
			trace_sched_stat_sleep(tsk, delta);
		}
	}
	if (se->statistics.block_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.block_max))
			se->statistics.block_max = delta;

		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			if (tsk->in_iowait) {
				se->statistics.iowait_sum += delta;
				se->statistics.iowait_count++;
				trace_sched_stat_iowait(tsk, delta);
			}

			trace_sched_stat_blocked(tsk, delta);

			/*
			 * Blocking time is in units of nanosecs, so shift by
			 * 20 to get a milliseconds-range estimation of the
			 * amount of time that the task spent sleeping:
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
						(void *)get_wchan(tsk),
						delta >> 20);
			}
			account_scheduler_latency(tsk, delta >> 10, 0);
		}
	}
#endif
}

static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}

static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime = cfs_rq->min_vruntime;

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice(cfs_rq, se);

	/* sleeps up to a single latency don't count. */
	if (!initial) {
		unsigned long thresh = sysctl_sched_latency;

		/*
		 * Halve their sleep time's effect, to allow
		 * for a gentler effect of sleepers:
		 */
		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			thresh >>= 1;

		vruntime -= thresh;
	}

	/* ensure we never gain time by being placed backwards. */
	vruntime = max_vruntime(se->vruntime, vruntime);

	se->vruntime = vruntime;
}

static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
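/*
 * [Editorial note, not part of the original source] Example of what
 * place_entity() above does: with GENTLE_FAIR_SLEEPERS a task waking from a
 * long sleep is placed at min_vruntime - sched_latency/2 (12 ms of credit
 * with a 24 ms latency), while a freshly forked task with START_DEBIT starts
 * one vslice *after* min_vruntime. The max_vruntime() clamp ensures the
 * placement never lowers an entity's existing vruntime, so re-placing a task
 * can never hand it extra credit.
 */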
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update the normalized vruntime before updating min_vruntime
	 * through calling update_curr().
	 */
	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
		se->vruntime += cfs_rq->min_vruntime;

	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	update_cfs_load(cfs_rq, 0);
	account_entity_enqueue(cfs_rq, se);
	update_cfs_shares(cfs_rq);

	if (flags & ENQUEUE_WAKEUP) {
		place_entity(cfs_rq, se, 0);
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
	se->on_rq = 1;

	if (cfs_rq->nr_running == 1) {
		list_add_leaf_cfs_rq(cfs_rq);
		check_enqueue_throttle(cfs_rq);
	}
}

static void __clear_buddies_last(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->last == se)
			cfs_rq->last = NULL;
		else
			break;
	}
}

static void __clear_buddies_next(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->next == se)
			cfs_rq->next = NULL;
		else
			break;
	}
}

static void __clear_buddies_skip(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->skip == se)
			cfs_rq->skip = NULL;
		else
			break;
	}
}

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->last == se)
		__clear_buddies_last(se);

	if (cfs_rq->next == se)
		__clear_buddies_next(se);

	if (cfs_rq->skip == se)
		__clear_buddies_skip(se);
}

static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);

static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	update_stats_dequeue(cfs_rq, se);
	if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->statistics.sleep_start = rq_of(cfs_rq)->clock;
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->statistics.block_start = rq_of(cfs_rq)->clock;
		}
#endif
	}

	clear_buddies(cfs_rq, se);

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	se->on_rq = 0;
	update_cfs_load(cfs_rq, 0);
	account_entity_dequeue(cfs_rq, se);

	/*
	 * Normalize the entity after updating the min_vruntime because the
	 * update can refer to the ->curr item and we need to reflect this
	 * movement in our normalized position.
	 */
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);

	update_min_vruntime(cfs_rq);
	update_cfs_shares(cfs_rq);
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;
	struct sched_entity *se;
	s64 delta;

	ideal_runtime = sched_slice(cfs_rq, curr);
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime) {
		resched_task(rq_of(cfs_rq)->curr);
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Ensure that a task that m