aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched_fair.c
diff options
context:
space:
mode:
authorPeter Zijlstra <a.p.zijlstra@chello.nl>2011-11-15 17:14:39 +0100
committerIngo Molnar <mingo@elte.hu>2011-11-17 12:20:22 +0100
commit391e43da797a96aeb65410281891f6d0b0e9611c (patch)
tree0ce6784525a5a8f75b377170cf1a7d60abccea29 /kernel/sched_fair.c
parent029632fbb7b7c9d85063cc9eb470de6c54873df3 (diff)
sched: Move all scheduler bits into kernel/sched/
There's too many sched*.[ch] files in kernel/, give them their own directory. (No code changed, other than Makefile glue added.) Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--kernel/sched_fair.c5601
1 files changed, 0 insertions, 5601 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
deleted file mode 100644
index cd3b64219d9..00000000000
--- a/kernel/sched_fair.c
+++ /dev/null
@@ -1,5601 +0,0 @@
-/*
- * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
- *
- * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * Interactivity improvements by Mike Galbraith
- * (C) 2007 Mike Galbraith <efault@gmx.de>
- *
- * Various enhancements by Dmitry Adamushko.
- * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
- *
- * Group scheduling enhancements by Srivatsa Vaddagiri
- * Copyright IBM Corporation, 2007
- * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
- *
- * Scaled math optimizations by Thomas Gleixner
- * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
- *
- * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
- * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- */
-
-#include <linux/latencytop.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <linux/slab.h>
-#include <linux/profile.h>
-#include <linux/interrupt.h>
-
-#include <trace/events/sched.h>
-
-#include "sched.h"
-
-/*
- * Targeted preemption latency for CPU-bound tasks:
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
- *
- * NOTE: this latency value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS are of variable length
- * and have no persistent notion like in traditional, time-slice
- * based scheduling concepts.
- *
- * (to see the precise effective timeslice length of your workload,
- * run vmstat and monitor the context-switches (cs) field)
- */
-unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
-
-/*
- * The initial- and re-scaling of tunables is configurable
- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
- *
- * Options are:
- * SCHED_TUNABLESCALING_NONE - unscaled, always *1
- * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
- * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
- */
-enum sched_tunable_scaling sysctl_sched_tunable_scaling
- = SCHED_TUNABLESCALING_LOG;
-
-/*
- * Minimal preemption granularity for CPU-bound tasks:
- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
-
-/*
- * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
- */
-static unsigned int sched_nr_latency = 8;
-
-/*
- * After fork, child runs first. If set to 0 (default) then
- * parent will (try to) run first.
- */
-unsigned int sysctl_sched_child_runs_first __read_mostly;
-
-/*
- * SCHED_OTHER wake-up granularity.
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
- *
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
- */
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
-
-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
-
-/*
- * The exponential sliding window over which load is averaged for shares
- * distribution.
- * (default: 10msec)
- */
-unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
-
-#ifdef CONFIG_CFS_BANDWIDTH
-/*
- * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
- * each time a cfs_rq requests quota.
- *
- * Note: in the case that the slice exceeds the runtime remaining (either due
- * to consumption or the quota being specified to be smaller than the slice)
- * we will always only issue the remaining available time.
- *
- * default: 5 msec, units: microseconds
- */
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
-#endif
-
-/*
- * Increase the granularity value when there are more CPUs,
- * because with more CPUs the 'effective latency' as visible
- * to users decreases. But the relationship is not linear,
- * so pick a second-best guess by going with the log2 of the
- * number of CPUs.
- *
- * This idea comes from the SD scheduler of Con Kolivas:
- */
-static int get_update_sysctl_factor(void)
-{
- unsigned int cpus = min_t(int, num_online_cpus(), 8);
- unsigned int factor;
-
- switch (sysctl_sched_tunable_scaling) {
- case SCHED_TUNABLESCALING_NONE:
- factor = 1;
- break;
- case SCHED_TUNABLESCALING_LINEAR:
- factor = cpus;
- break;
- case SCHED_TUNABLESCALING_LOG:
- default:
- factor = 1 + ilog2(cpus);
- break;
- }
-
- return factor;
-}
-
-static void update_sysctl(void)
-{
- unsigned int factor = get_update_sysctl_factor();
-
-#define SET_SYSCTL(name) \
- (sysctl_##name = (factor) * normalized_sysctl_##name)
- SET_SYSCTL(sched_min_granularity);
- SET_SYSCTL(sched_latency);
- SET_SYSCTL(sched_wakeup_granularity);
-#undef SET_SYSCTL
-}
-
-void sched_init_granularity(void)
-{
- update_sysctl();
-}
-
-#if BITS_PER_LONG == 32
-# define WMULT_CONST (~0UL)
-#else
-# define WMULT_CONST (1UL << 32)
-#endif
-
-#define WMULT_SHIFT 32
-
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
-
-/*
- * delta *= weight / lw
- */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
- struct load_weight *lw)
-{
- u64 tmp;
-
- /*
- * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
- * entities since MIN_SHARES = 2. Treat weight as 1 if less than
- * 2^SCHED_LOAD_RESOLUTION.
- */
- if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
- tmp = (u64)delta_exec * scale_load_down(weight);
- else
- tmp = (u64)delta_exec;
-
- if (!lw->inv_weight) {
- unsigned long w = scale_load_down(lw->weight);
-
- if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
- lw->inv_weight = 1;
- else if (unlikely(!w))
- lw->inv_weight = WMULT_CONST;
- else
- lw->inv_weight = WMULT_CONST / w;
- }
-
- /*
- * Check whether we'd overflow the 64-bit multiplication:
- */
- if (unlikely(tmp > WMULT_CONST))
- tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
- WMULT_SHIFT/2);
- else
- tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
-
- return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
-}
-
-
-const struct sched_class fair_sched_class;
-
-/**************************************************************
- * CFS operations on generic schedulable entities:
- */
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/* cpu runqueue to which this cfs_rq is attached */
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->rq;
-}
-
-/* An entity is a task if it doesn't "own" a runqueue */
-#define entity_is_task(se) (!se->my_q)
-
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
-#ifdef CONFIG_SCHED_DEBUG
- WARN_ON_ONCE(!entity_is_task(se));
-#endif
- return container_of(se, struct task_struct, se);
-}
-
-/* Walk up scheduling entities hierarchy */
-#define for_each_sched_entity(se) \
- for (; se; se = se->parent)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return grp->my_q;
-}
-
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
-{
- if (!cfs_rq->on_list) {
- /*
- * Ensure we either appear before our parent (if already
- * enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases.
- */
- if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
- } else {
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
- }
-
- cfs_rq->on_list = 1;
- }
-}
-
-static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->on_list) {
- list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
- cfs_rq->on_list = 0;
- }
-}
-
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
-
-/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
- if (se->cfs_rq == pse->cfs_rq)
- return 1;
-
- return 0;
-}
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
-{
- return se->parent;
-}
-
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
- int depth = 0;
-
- for_each_sched_entity(se)
- depth++;
-
- return depth;
-}
-
-static void
-find_matching_se(struct sched_entity **se, struct sched_entity **pse)
-{
- int se_depth, pse_depth;
-
- /*
- * preemption test can be made between sibling entities who are in the
- * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
- * both tasks until we find their ancestors who are siblings of common
- * parent.
- */
-
- /* First walk up until both entities are at same depth */
- se_depth = depth_se(*se);
- pse_depth = depth_se(*pse);
-
- while (se_depth > pse_depth) {
- se_depth--;
- *se = parent_entity(*se);
- }
-
- while (pse_depth > se_depth) {
- pse_depth--;
- *pse = parent_entity(*pse);
- }
-
- while (!is_same_group(*se, *pse)) {
- *se = parent_entity(*se);
- *pse = parent_entity(*pse);
- }
-}
-
-#else /* !CONFIG_FAIR_GROUP_SCHED */
-
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- return container_of(se, struct task_struct, se);
-}
-
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
- return container_of(cfs_rq, struct rq, cfs);
-}
-
-#define entity_is_task(se) 1
-
-#define for_each_sched_entity(se) \
- for (; se; se = NULL)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return &task_rq(p)->cfs;
-}
-
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- struct task_struct *p = task_of(se);
- struct rq *rq = task_rq(p);
-
- return &rq->cfs;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return NULL;
-}
-
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
-{
-}
-
-static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
-{
-}
-
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
-
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
- return 1;
-}
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
-{
- return NULL;
-}
-
-static inline void
-find_matching_se(struct sched_entity **se, struct sched_entity **pse)
-{
-}
-
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec);
-
-/**************************************************************
- * Scheduling class tree data structure manipulation methods:
- */
-
-static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
-{
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta > 0)
- min_vruntime = vruntime;
-
- return min_vruntime;
-}
-
-static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
-{
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta < 0)
- min_vruntime = vruntime;
-
- return min_vruntime;
-}
-
-static inline int entity_before(struct sched_entity *a,
- struct sched_entity *b)
-{
- return (s64)(a->vruntime - b->vruntime) < 0;
-}
-
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
-{
- u64 vruntime = cfs_rq->min_vruntime;
-
- if (cfs_rq->curr)
- vruntime = cfs_rq->curr->vruntime;
-
- if (cfs_rq->rb_leftmost) {
- struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
- struct sched_entity,
- run_node);
-
- if (!cfs_rq->curr)
- vruntime = se->vruntime;
- else
- vruntime = min_vruntime(vruntime, se->vruntime);
- }
-
- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
-}
-
-/*
- * Enqueue an entity into the rb-tree:
- */
-static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
- struct rb_node *parent = NULL;
- struct sched_entity *entry;
- int leftmost = 1;
-
- /*
- * Find the right place in the rbtree:
- */
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct sched_entity, run_node);
- /*
- * We dont care about collisions. Nodes with
- * the same key stay together.
- */
- if (entity_before(se, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = 0;
- }
- }
-
- /*
- * Maintain a cache of leftmost tree entries (it is frequently
- * used):
- */
- if (leftmost)
- cfs_rq->rb_leftmost = &se->run_node;
-
- rb_link_node(&se->run_node, parent, link);
- rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
-}
-
-static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- if (cfs_rq->rb_leftmost == &se->run_node) {
- struct rb_node *next_node;
-
- next_node = rb_next(&se->run_node);
- cfs_rq->rb_leftmost = next_node;
- }
-
- rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
-}
-
-struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
-{
- struct rb_node *left = cfs_rq->rb_leftmost;
-
- if (!left)
- return NULL;
-
- return rb_entry(left, struct sched_entity, run_node);
-}
-
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
-{
- struct rb_node *next = rb_next(&se->run_node);
-
- if (!next)
- return NULL;
-
- return rb_entry(next, struct sched_entity, run_node);
-}
-
-#ifdef CONFIG_SCHED_DEBUG
-struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
-{
- struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
-
- if (!last)
- return NULL;
-
- return rb_entry(last, struct sched_entity, run_node);
-}
-
-/**************************************************************
- * Scheduling class statistics methods:
- */
-
-int sched_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- int factor = get_update_sysctl_factor();
-
- if (ret || !write)
- return ret;
-
- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
- sysctl_sched_min_granularity);
-
-#define WRT_SYSCTL(name) \
- (normalized_sysctl_##name = sysctl_##name / (factor))
- WRT_SYSCTL(sched_min_granularity);
- WRT_SYSCTL(sched_latency);
- WRT_SYSCTL(sched_wakeup_granularity);
-#undef WRT_SYSCTL
-
- return 0;
-}
-#endif
-
-/*
- * delta /= w
- */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
-{
- if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
-
- return delta;
-}
-
-/*
- * The idea is to set a period in which each task runs once.
- *
- * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
- * this period because otherwise the slices get too small.
- *
- * p = (nr <= nl) ? l : l*nr/nl
- */
-static u64 __sched_period(unsigned long nr_running)
-{
- u64 period = sysctl_sched_latency;
- unsigned long nr_latency = sched_nr_latency;
-
- if (unlikely(nr_running > nr_latency)) {
- period = sysctl_sched_min_granularity;
- period *= nr_running;
- }
-
- return period;
-}
-
-/*
- * We calculate the wall-time slice from the period by taking a part
- * proportional to the weight.
- *
- * s = p*P[w/rw]
- */
-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
-
- for_each_sched_entity(se) {
- struct load_weight *load;
- struct load_weight lw;
-
- cfs_rq = cfs_rq_of(se);
- load = &cfs_rq->load;
-
- if (unlikely(!se->on_rq)) {
- lw = cfs_rq->load;
-
- update_load_add(&lw, se->load.weight);
- load = &lw;
- }
- slice = calc_delta_mine(slice, se->load.weight, load);
- }
- return slice;
-}
-
-/*
- * We calculate the vruntime slice of a to be inserted task
- *
- * vs = s/w
- */
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- return calc_delta_fair(sched_slice(cfs_rq, se), se);
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
-/*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
- */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
- unsigned long delta_exec)
-{
- unsigned long delta_exec_weighted;
-
- schedstat_set(curr->statistics.exec_max,
- max((u64)delta_exec, curr->statistics.exec_max));
-
- curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq, exec_clock, delta_exec);
- delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
- curr->vruntime += delta_exec_weighted;
- update_min_vruntime(cfs_rq);
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
- cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
-}
-
-static void update_curr(struct cfs_rq *cfs_rq)
-{
- struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_of(cfs_rq)->clock_task;
- unsigned long delta_exec;
-
- if (unlikely(!curr))
- return;
-
- /*
- * Get the amount of time the current task was running
- * since the last time we changed load (this cannot
- * overflow on 32 bits):
- */
- delta_exec = (unsigned long)(now - curr->exec_start);
- if (!delta_exec)
- return;
-
- __update_curr(cfs_rq, curr, delta_exec);
- curr->exec_start = now;
-
- if (entity_is_task(curr)) {
- struct task_struct *curtask = task_of(curr);
-
- trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
- cpuacct_charge(curtask, delta_exec);
- account_group_exec_runtime(curtask, delta_exec);
- }
-
- account_cfs_rq_runtime(cfs_rq, delta_exec);
-}
-
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
-}
-
-/*
- * Task is being enqueued - update stats:
- */
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- /*
- * Are we enqueueing a waiting task? (for current tasks
- * a dequeue/enqueue event is a NOP)
- */
- if (se != cfs_rq->curr)
- update_stats_wait_start(cfs_rq, se);
-}
-
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_of(cfs_rq)->clock - se->statistics.wait_start));
- schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
- schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_of(cfs_rq)->clock - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- trace_sched_stat_wait(task_of(se),
- rq_of(cfs_rq)->clock - se->statistics.wait_start);
- }
-#endif
- schedstat_set(se->statistics.wait_start, 0);
-}
-
-static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- /*
- * Mark the end of the wait period if dequeueing a
- * waiting task:
- */
- if (se != cfs_rq->curr)
- update_stats_wait_end(cfs_rq, se);
-}
-
-/*
- * We are picking a new current task - update its stats:
- */
-static inline void
-update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- /*
- * We are starting a new run period:
- */
- se->exec_start = rq_of(cfs_rq)->clock_task;
-}
-
-/**************************************************
- * Scheduling class queueing methods:
- */
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
- cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
-static void
-account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- update_load_add(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
- if (entity_is_task(se)) {
- add_cfs_task_weight(cfs_rq, se->load.weight);
- list_add(&se->group_node, &cfs_rq->tasks);
- }
- cfs_rq->nr_running++;
-}
-
-static void
-account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- update_load_sub(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
- if (entity_is_task(se)) {
- add_cfs_task_weight(cfs_rq, -se->load.weight);
- list_del_init(&se->group_node);
- }
- cfs_rq->nr_running--;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-# ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
- int global_update)
-{
- struct task_group *tg = cfs_rq->tg;
- long load_avg;
-
- load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
- load_avg -= cfs_rq->load_contribution;
-
- if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
- atomic_add(load_avg, &tg->load_weight);
- cfs_rq->load_contribution += load_avg;
- }
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
- u64 period = sysctl_sched_shares_window;
- u64 now, delta;
- unsigned long load = cfs_rq->load.weight;
-
- if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
- return;
-
- now = rq_of(cfs_rq)->clock_task;
- delta = now - cfs_rq->load_stamp;
-
- /* truncate load history at 4 idle periods */
- if (cfs_rq->load_stamp > cfs_rq->load_last &&
- now - cfs_rq->load_last > 4 * period) {
- cfs_rq->load_period = 0;
- cfs_rq->load_avg = 0;
- delta = period - 1;
- }
-
- cfs_rq->load_stamp = now;
- cfs_rq->load_unacc_exec_time = 0;
- cfs_rq->load_period += delta;
- if (load) {
- cfs_rq->load_last = now;
- cfs_rq->load_avg += delta * load;
- }
-
- /* consider updating load contribution on each fold or truncate */
- if (global_update || cfs_rq->load_period > period
- || !cfs_rq->load_period)
- update_cfs_rq_load_contribution(cfs_rq, global_update);
-
- while (cfs_rq->load_period > period) {
- /*
- * Inline assembly required to prevent the compiler
- * optimising this loop into a divmod call.
- * See __iter_div_u64_rem() for another example of this.
- */
- asm("" : "+rm" (cfs_rq->load_period));
- cfs_rq->load_period /= 2;
- cfs_rq->load_avg /= 2;
- }
-
- if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
- list_del_leaf_cfs_rq(cfs_rq);
-}
-
-static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
-{
- long tg_weight;
-
- /*
- * Use this CPU's actual weight instead of the last load_contribution
- * to gain a more accurate current total weight. See
- * update_cfs_rq_load_contribution().
- */
- tg_weight = atomic_read(&tg->load_weight);
- tg_weight -= cfs_rq->load_contribution;
- tg_weight += cfs_rq->load.weight;
-
- return tg_weight;
-}
-
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- long tg_weight, load, shares;
-
- tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq->load.weight;
-
- shares = (tg->shares * load);
- if (tg_weight)
- shares /= tg_weight;
-
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- if (shares > tg->shares)
- shares = tg->shares;
-
- return shares;
-}
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
- }
-}
-# else /* CONFIG_SMP */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- return tg->shares;
-}
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
-# endif /* CONFIG_SMP */
-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
- unsigned long weight)
-{
- if (se->on_rq) {
- /* commit outstanding execution time */
- if (cfs_rq->curr == se)
- update_curr(cfs_rq);
- account_entity_dequeue(cfs_rq, se);
- }
-
- update_load_set(&se->load, weight);
-
- if (se->on_rq)
- account_entity_enqueue(cfs_rq, se);
-}
-
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
-{
- struct task_group *tg;
- struct sched_entity *se;
- long shares;
-
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se || throttled_hierarchy(cfs_rq))
- return;
-#ifndef CONFIG_SMP
- if (likely(se->load.weight == tg->shares))
- return;
-#endif
- shares = calc_cfs_shares(cfs_rq, tg);
-
- reweight_entity(cfs_rq_of(se), se, shares);
-}
-#else /* CONFIG_FAIR_GROUP_SCHED */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
-{
-}
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHEDSTATS
- struct task_struct *tsk = NULL;
-
- if (entity_is_task(se))
- tsk = task_of(se);
-
- if (se->statistics.sleep_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > se->statistics.sleep_max))
- se->statistics.sleep_max = delta;
-
- se->statistics.sleep_start = 0;
- se->statistics.sum_sleep_runtime += delta;
-
- if (tsk) {
- account_scheduler_latency(tsk, delta >> 10, 1);
- trace_sched_stat_sleep(tsk, delta);
- }
- }
- if (se->statistics.block_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > se->statistics.block_max))
- se->statistics.block_max = delta;
-
- se->statistics.block_start = 0;
- se->statistics.sum_sleep_runtime += delta;
-
- if (tsk) {
- if (tsk->in_iowait) {
- se->statistics.iowait_sum += delta;
- se->statistics.iowait_count++;
- trace_sched_stat_iowait(tsk, delta);
- }
-
- /*
- * Blocking time is in units of nanosecs, so shift by
- * 20 to get a milliseconds-range estimation of the
- * amount of time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
- profile_hits(SLEEP_PROFILING,
- (void *)get_wchan(tsk),
- delta >> 20);
- }
- account_scheduler_latency(tsk, delta >> 10, 0);
- }
- }
-#endif
-}
-
-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHED_DEBUG
- s64 d = se->vruntime - cfs_rq->min_vruntime;
-
- if (d < 0)
- d = -d;
-
- if (d > 3*sysctl_sched_latency)
- schedstat_inc(cfs_rq, nr_spread_over);
-#endif
-}
-
-static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
-{
- u64 vruntime = cfs_rq->min_vruntime;
-
- /*
- * The 'current' period is already promised to the current tasks,
- * however the extra weight of the new task will slow them down a
- * little, place the new task so that it fits in the slot that
- * stays open at the end.
- */
- if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice(cfs_rq, se);
-
- /* sleeps up to a single latency don't count. */
- if (!initial) {
- unsigned long thresh = sysctl_sched_latency;
-
- /*
- * Halve their sleep time's effect, to allow
- * for a gentler effect of sleepers:
- */
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
- thresh >>= 1;
-
- vruntime -= thresh;
- }
-
- /* ensure we never gain time by being placed backwards. */
- vruntime = max_vruntime(se->vruntime, vruntime);
-
- se->vruntime = vruntime;
-}
-
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
-
-static void
-enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
- /*
- * Update the normalized vruntime before updating min_vruntime
- * through callig update_curr().
- */
- if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
- se->vruntime += cfs_rq->min_vruntime;
-
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
- update_cfs_load(cfs_rq, 0);
- account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
-
- if (flags & ENQUEUE_WAKEUP) {
- place_entity(cfs_rq, se, 0);
- enqueue_sleeper(cfs_rq, se);
- }
-
- update_stats_enqueue(cfs_rq, se);
- check_spread(cfs_rq, se);
- if (se != cfs_rq->curr)
- __enqueue_entity(cfs_rq, se);
- se->on_rq = 1;
-
- if (cfs_rq->nr_running == 1) {
- list_add_leaf_cfs_rq(cfs_rq);
- check_enqueue_throttle(cfs_rq);
- }
-}
-
-static void __clear_buddies_last(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last == se)
- cfs_rq->last = NULL;
- else
- break;
- }
-}
-
-static void __clear_buddies_next(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
- else
- break;
- }
-}
-
-static void __clear_buddies_skip(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip == se)
- cfs_rq->skip = NULL;
- else
- break;
- }
-}
-
-static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- if (cfs_rq->last == se)
- __clear_buddies_last(se);
-
- if (cfs_rq->next == se)
- __clear_buddies_next(se);
-
- if (cfs_rq->skip == se)
- __clear_buddies_skip(se);
-}
-
-static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-
-static void
-dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
-
- update_stats_dequeue(cfs_rq, se);
- if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- struct task_struct *tsk = task_of(se);
-
- if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_of(cfs_rq)->clock;
- if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_of(cfs_rq)->clock;
- }
-#endif
- }
-
- clear_buddies(cfs_rq, se);
-
- if (se != cfs_rq->curr)
- __dequeue_entity(cfs_rq, se);
- se->on_rq = 0;
- update_cfs_load(cfs_rq, 0);
- account_entity_dequeue(cfs_rq, se);
-
- /*
- * Normalize the entity after updating the min_vruntime because the
- * update can refer to the ->curr item and we need to reflect this
- * movement in our normalized positi