diff options
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r-- | kernel/sched_fair.c | 247 |
1 files changed, 171 insertions, 76 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e24ecd39c4b..f2aa987027d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_OTHER wake-up granularity. - * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity = 10000000UL; +unsigned int sysctl_sched_wakeup_granularity = 5000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -429,12 +429,38 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se) for_each_sched_entity(se) { struct load_weight *se_lw = &se->load; + unsigned long rw = cfs_rq_of(se)->load.weight; + +#ifdef CONFIG_FAIR_SCHED_GROUP + struct cfs_rq *cfs_rq = se->my_q; + struct task_group *tg = NULL + + if (cfs_rq) + tg = cfs_rq->tg; + + if (tg && tg->shares < NICE_0_LOAD) { + /* + * scale shares to what it would have been had + * tg->weight been NICE_0_LOAD: + * + * weight = 1024 * shares / tg->weight + */ + lw.weight *= se->load.weight; + lw.weight /= tg->shares; + + lw.inv_weight = 0; - if (se->load.weight < NICE_0_LOAD) se_lw = &lw; + rw += lw.weight - se->load.weight; + } else +#endif - delta = calc_delta_mine(delta, - cfs_rq_of(se)->load.weight, se_lw); + if (se->load.weight < NICE_0_LOAD) { + se_lw = &lw; + rw += NICE_0_LOAD - se->load.weight; + } + + delta = calc_delta_mine(delta, rw, se_lw); } return delta; @@ -700,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) __enqueue_entity(cfs_rq, se); } -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} - -static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (!se->last_wakeup) - return; - - update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); - se->last_wakeup = 0; -} - static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -725,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { - update_avg_stats(cfs_rq, se); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -787,17 +797,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); - static struct sched_entity * pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!cfs_rq->next) - return se; + struct rq *rq = rq_of(cfs_rq); + u64 pair_slice = rq->clock - cfs_rq->pair_start; - if (wakeup_preempt_entity(cfs_rq->next, se) != 0) + if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { + cfs_rq->pair_start = rq->clock; return se; + } return cfs_rq->next; } @@ -896,7 +905,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) hrtick_start(rq, delta, requeue); } } -#else +#else /* !CONFIG_SCHED_HRTICK */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { @@ -1037,7 +1046,7 @@ static int wake_idle(int cpu, struct task_struct *p) } return cpu; } -#else +#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ static inline int wake_idle(int cpu, struct task_struct *p) { return cpu; @@ -1048,6 +1057,89 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * effective_load() calculates the load change as seen from the root_task_group + * + * Adding load to a group doesn't make a group heavier, but can cause movement + * of group shares between cpus. Assuming the shares were perfectly aligned one + * can calculate the shift in shares. + * + * The problem is that perfectly aligning the shares is rather expensive, hence + * we try to avoid doing that too often - see update_shares(), which ratelimits + * this change. + * + * We compensate this by not only taking the current delta into account, but + * also considering the delta between when the shares were last adjusted and + * now. + * + * We still saw a performance dip, some tracing learned us that between + * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased + * significantly. Therefore try to bias the error in direction of failing + * the affine wakeup. + * + */ +static long effective_load(struct task_group *tg, int cpu, + long wl, long wg) +{ + struct sched_entity *se = tg->se[cpu]; + long more_w; + + if (!tg->parent) + return wl; + + /* + * By not taking the decrease of shares on the other cpu into + * account our error leans towards reducing the affine wakeups. + */ + if (!wl && sched_feat(ASYM_EFF_LOAD)) + return wl; + + /* + * Instead of using this increment, also add the difference + * between when the shares were last updated and now. + */ + more_w = se->my_q->load.weight - se->my_q->rq_weight; + wl += more_w; + wg += more_w; + + for_each_sched_entity(se) { +#define D(n) (likely(n) ? (n) : 1) + + long S, rw, s, a, b; + + S = se->my_q->tg->shares; + s = se->my_q->shares; + rw = se->my_q->rq_weight; + + a = S*(rw + wl); + b = S*rw + s*wg; + + wl = s*(a-b)/D(b); + /* + * Assume the group is already running and will + * thus already be accounted for in the weight. + * + * That is, moving shares between CPUs, does not + * alter the group weight. + */ + wg = 0; +#undef D + } + + return wl; +} + +#else + +static inline unsigned long effective_load(struct task_group *tg, int cpu, + unsigned long wl, unsigned long wg) +{ + return wl; +} + +#endif + static int wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, @@ -1055,36 +1147,50 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, unsigned int imbalance) { struct task_struct *curr = this_rq->curr; + struct task_group *tg; unsigned long tl = this_load; unsigned long tl_per_task; + unsigned long weight; + int balanced; - if (!(this_sd->flags & SD_WAKE_AFFINE)) + if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) { + tg = task_group(current); + weight = current->se.load.weight; + + tl += effective_load(tg, this_cpu, -weight, -weight); + load += effective_load(tg, prev_cpu, 0, -weight); + } + + tg = task_group(p); + weight = p->se.load.weight; + + balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= + imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); + + /* * If the currently running task will sleep within * a reasonable amount of time then attract this newly * woken task: */ - if (sync && curr->sched_class == &fair_sched_class) { + if (sync && balanced) { if (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) + p->se.avg_overlap < sysctl_sched_migration_cost) return 1; } schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) - tl -= current->se.load.weight; - if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || - 100*(tl + p->se.load.weight) <= imbalance*load) { + balanced) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1172,7 +1278,10 @@ static unsigned long wakeup_gran(struct sched_entity *se) * More easily preempt - nice tasks, while not making it harder for * + nice tasks. */ - gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); + if (sched_feat(ASYM_GRAN)) + gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); + else + gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); return gran; } @@ -1234,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) return; } - se->last_wakeup = se->sum_exec_runtime; if (unlikely(se == pse)) return; @@ -1332,23 +1440,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) struct task_struct *p = NULL; struct sched_entity *se; - if (next == &cfs_rq->tasks) - return NULL; - - /* Skip over entities that are not tasks */ - do { + while (next != &cfs_rq->tasks) { se = list_entry(next, struct sched_entity, group_node); next = next->next; - } while (next != &cfs_rq->tasks && !entity_is_task(se)); - if (next == &cfs_rq->tasks) - return NULL; + /* Skip over entities that are not tasks */ + if (entity_is_task(se)) { + p = task_of(se); + break; + } + } cfs_rq->balance_iterator = next; - - if (entity_is_task(se)) - p = task_of(se); - return p; } @@ -1395,40 +1498,32 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, struct task_group *tg; rcu_read_lock(); + update_h_load(busiest_cpu); + list_for_each_entry(tg, &task_groups, list) { - long imbalance; - unsigned long this_weight, busiest_weight; - long rem_load, max_load, moved_load; + struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; + unsigned long busiest_h_load = busiest_cfs_rq->h_load; + unsigned long busiest_weight = busiest_cfs_rq->load.weight; + u64 rem_load, moved_load; /* * empty group */ - if (!aggregate(tg, sd)->task_weight) + if (!busiest_cfs_rq->task_weight) continue; - rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; - rem_load /= aggregate(tg, sd)->load + 1; + rem_load = (u64)rem_load_move * busiest_weight; + rem_load = div_u64(rem_load, busiest_h_load + 1); - this_weight = tg->cfs_rq[this_cpu]->task_weight; - busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; - - imbalance = (busiest_weight - this_weight) / 2; - - if (imbalance < 0) - imbalance = busiest_weight; - - max_load = max(rem_load, imbalance); moved_load = __load_balance_fair(this_rq, this_cpu, busiest, - max_load, sd, idle, all_pinned, this_best_prio, + rem_load, sd, idle, all_pinned, this_best_prio, tg->cfs_rq[busiest_cpu]); if (!moved_load) continue; - move_group_shares(tg, sd, busiest_cpu, this_cpu); - - moved_load *= aggregate(tg, sd)->load; - moved_load /= aggregate(tg, sd)->rq_weight + 1; + moved_load *= busiest_h_load; + moved_load = div_u64(moved_load, busiest_weight + 1); rem_load_move -= moved_load; if (rem_load_move < 0) @@ -1474,7 +1569,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -#endif +#endif /* CONFIG_SMP */ /* * scheduler tick hitting a task of our scheduling class: |