diff options
Diffstat (limited to 'kernel/sched/fair.c')
| -rw-r--r-- | kernel/sched/fair.c | 2625 |
1 files changed, 2046 insertions, 579 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9b3fe1cd8f4..fea7d3335e1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -178,59 +178,61 @@ void sched_init_granularity(void) update_sysctl(); } -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - +#define WMULT_CONST (~0U) #define WMULT_SHIFT 32 -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +static void __update_inv_weight(struct load_weight *lw) +{ + unsigned long w; + + if (likely(lw->inv_weight)) + return; + + w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) + lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; + else + lw->inv_weight = WMULT_CONST / w; +} /* - * delta *= weight / lw + * delta_exec * weight / lw.weight + * OR + * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT + * + * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case + * we're guaranteed shift stays positive because inv_weight is guaranteed to + * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. + * + * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus + * weight/lw.weight <= 1, and therefore our shift will also be positive. */ -static unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) +static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) { - u64 tmp; + u64 fact = scale_load_down(weight); + int shift = WMULT_SHIFT; - /* - * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched - * entities since MIN_SHARES = 2. Treat weight as 1 if less than - * 2^SCHED_LOAD_RESOLUTION. - */ - if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) - tmp = (u64)delta_exec * scale_load_down(weight); - else - tmp = (u64)delta_exec; + __update_inv_weight(lw); - if (!lw->inv_weight) { - unsigned long w = scale_load_down(lw->weight); - - if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) - lw->inv_weight = 1; - else if (unlikely(!w)) - lw->inv_weight = WMULT_CONST; - else - lw->inv_weight = WMULT_CONST / w; + if (unlikely(fact >> 32)) { + while (fact >> 32) { + fact >>= 1; + shift--; + } } - /* - * Check whether we'd overflow the 64-bit multiplication: - */ - if (unlikely(tmp > WMULT_CONST)) - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, - WMULT_SHIFT/2); - else - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); + /* hint to use a 32x32->64 mul */ + fact = (u64)(u32)fact * lw->inv_weight; - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); + while (fact >> 32) { + fact >>= 1; + shift--; + } + + return mul_u64_u32_shr(delta_exec, fact, shift); } @@ -320,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ -static inline int +static inline struct cfs_rq * is_same_group(struct sched_entity *se, struct sched_entity *pse) { if (se->cfs_rq == pse->cfs_rq) - return 1; + return se->cfs_rq; - return 0; + return NULL; } static inline struct sched_entity *parent_entity(struct sched_entity *se) @@ -334,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - static void find_matching_se(struct sched_entity **se, struct sched_entity **pse) { @@ -358,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) */ /* First walk up until both entities are at same depth */ - se_depth = depth_se(*se); - pse_depth = depth_se(*pse); + se_depth = (*se)->depth; + pse_depth = (*pse)->depth; while (se_depth > pse_depth) { se_depth--; @@ -424,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - return 1; -} - static inline struct sched_entity *parent_entity(struct sched_entity *se) { return NULL; @@ -443,7 +428,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -612,11 +597,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write, /* * delta /= w */ -static inline unsigned long -calc_delta_fair(unsigned long delta, struct sched_entity *se) +static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) { if (unlikely(se->load.weight != NICE_0_LOAD)) - delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); + delta = __calc_delta(delta, NICE_0_LOAD, &se->load); return delta; } @@ -665,7 +649,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&lw, se->load.weight); load = &lw; } - slice = calc_delta_mine(slice, se->load.weight, load); + slice = __calc_delta(slice, se->load.weight, load); } return slice; } @@ -681,6 +665,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP +static unsigned long task_h_load(struct task_struct *p); + static inline void __update_task_entity_contrib(struct sched_entity *se); /* Give new task start runnable values to heavy its load in infant time */ @@ -701,47 +687,32 @@ void init_task_runnable_average(struct task_struct *p) #endif /* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. + * Update the current task's runtime statistics. */ -static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, - unsigned long delta_exec) -{ - unsigned long delta_exec_weighted; - - schedstat_set(curr->statistics.exec_max, - max((u64)delta_exec, curr->statistics.exec_max)); - - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = calc_delta_fair(delta_exec, curr); - - curr->vruntime += delta_exec_weighted; - update_min_vruntime(cfs_rq); -} - static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; u64 now = rq_clock_task(rq_of(cfs_rq)); - unsigned long delta_exec; + u64 delta_exec; if (unlikely(!curr)) return; - /* - * Get the amount of time the current task was running - * since the last time we changed load (this cannot - * overflow on 32 bits): - */ - delta_exec = (unsigned long)(now - curr->exec_start); - if (!delta_exec) + delta_exec = now - curr->exec_start; + if (unlikely((s64)delta_exec <= 0)) return; - __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; + schedstat_set(curr->statistics.exec_max, + max(delta_exec, curr->statistics.exec_max)); + + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq, exec_clock, delta_exec); + + curr->vruntime += calc_delta_fair(delta_exec, curr); + update_min_vruntime(cfs_rq); + if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); @@ -818,11 +789,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_NUMA_BALANCING /* - * numa task sample period in ms + * Approximate time to scan a full NUMA task in ms. The task scan period is + * calculated based on the tasks virtual memory size and + * numa_balancing_scan_size. */ -unsigned int sysctl_numa_balancing_scan_period_min = 100; -unsigned int sysctl_numa_balancing_scan_period_max = 100*50; -unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; +unsigned int sysctl_numa_balancing_scan_period_min = 1000; +unsigned int sysctl_numa_balancing_scan_period_max = 60000; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = 256; @@ -830,41 +802,1057 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; -static void task_numa_placement(struct task_struct *p) +static unsigned int task_nr_scan_windows(struct task_struct *p) +{ + unsigned long rss = 0; + unsigned long nr_scan_pages; + + /* + * Calculations based on RSS as non-present and empty pages are skipped + * by the PTE scanner and NUMA hinting faults should be trapped based + * on resident pages + */ + nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); + rss = get_mm_rss(p->mm); + if (!rss) + rss = nr_scan_pages; + + rss = round_up(rss, nr_scan_pages); + return rss / nr_scan_pages; +} + +/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ +#define MAX_SCAN_WINDOW 2560 + +static unsigned int task_scan_min(struct task_struct *p) +{ + unsigned int scan, floor; + unsigned int windows = 1; + + if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + floor = 1000 / windows; + + scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); + return max_t(unsigned int, floor, scan); +} + +static unsigned int task_scan_max(struct task_struct *p) +{ + unsigned int smin = task_scan_min(p); + unsigned int smax; + + /* Watch for min being lower than max due to floor calculations */ + smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); + return max(smin, smax); +} + +static void account_numa_enqueue(struct rq *rq, struct task_struct *p) +{ + rq->nr_numa_running += (p->numa_preferred_nid != -1); + rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); +} + +static void account_numa_dequeue(struct rq *rq, struct task_struct *p) +{ + rq->nr_numa_running -= (p->numa_preferred_nid != -1); + rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); +} + +struct numa_group { + atomic_t refcount; + + spinlock_t lock; /* nr_tasks, tasks */ + int nr_tasks; + pid_t gid; + struct list_head task_list; + + struct rcu_head rcu; + nodemask_t active_nodes; + unsigned long total_faults; + /* + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. + */ + unsigned long *faults_cpu; + unsigned long faults[0]; +}; + +/* Shared or private faults. */ +#define NR_NUMA_HINT_FAULT_TYPES 2 + +/* Memory and CPU locality */ +#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2) + +/* Averaged statistics, and temporary buffers. */ +#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) + +pid_t task_numa_group_id(struct task_struct *p) +{ + return p->numa_group ? p->numa_group->gid : 0; +} + +static inline int task_faults_idx(int nid, int priv) +{ + return NR_NUMA_HINT_FAULT_TYPES * nid + priv; +} + +static inline unsigned long task_faults(struct task_struct *p, int nid) +{ + if (!p->numa_faults_memory) + return 0; + + return p->numa_faults_memory[task_faults_idx(nid, 0)] + + p->numa_faults_memory[task_faults_idx(nid, 1)]; +} + +static inline unsigned long group_faults(struct task_struct *p, int nid) +{ + if (!p->numa_group) + return 0; + + return p->numa_group->faults[task_faults_idx(nid, 0)] + + p->numa_group->faults[task_faults_idx(nid, 1)]; +} + +static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) +{ + return group->faults_cpu[task_faults_idx(nid, 0)] + + group->faults_cpu[task_faults_idx(nid, 1)]; +} + +/* + * These return the fraction of accesses done by a particular task, or + * task group, on a particular numa node. The group weight is given a + * larger multiplier, in order to group tasks together that are almost + * evenly spread out between numa nodes. + */ +static inline unsigned long task_weight(struct task_struct *p, int nid) +{ + unsigned long total_faults; + + if (!p->numa_faults_memory) + return 0; + + total_faults = p->total_numa_faults; + + if (!total_faults) + return 0; + + return 1000 * task_faults(p, nid) / total_faults; +} + +static inline unsigned long group_weight(struct task_struct *p, int nid) +{ + if (!p->numa_group || !p->numa_group->total_faults) + return 0; + + return 1000 * group_faults(p, nid) / p->numa_group->total_faults; +} + +bool should_numa_migrate_memory(struct task_struct *p, struct page * page, + int src_nid, int dst_cpu) +{ + struct numa_group *ng = p->numa_group; + int dst_nid = cpu_to_node(dst_cpu); + int last_cpupid, this_cpupid; + + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + + /* + * Multi-stage node selection is used in conjunction with a periodic + * migration fault to build a temporal task<->page relation. By using + * a two-stage filter we remove short/unlikely relations. + * + * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate + * a task's usage of a particular page (n_p) per total usage of this + * page (n_t) (in a given time-span) to a probability. + * + * Our periodic faults will sample this probability and getting the + * same result twice in a row, given these samples are fully + * independent, is then given by P(n)^2, provided our sample period + * is sufficiently short compared to the usage pattern. + * + * This quadric squishes small probabilities, making it less likely we + * act on an unlikely task<->page relation. + */ + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!cpupid_pid_unset(last_cpupid) && + cpupid_to_nid(last_cpupid) != dst_nid) + return false; + + /* Always allow migrate on private faults */ + if (cpupid_match_pid(p, last_cpupid)) + return true; + + /* A shared fault, but p->numa_group has not been set up yet. */ + if (!ng) + return true; + + /* + * Do not migrate if the destination is not a node that + * is actively used by this numa group. + */ + if (!node_isset(dst_nid, ng->active_nodes)) + return false; + + /* + * Source is a node that is not actively used by this + * numa group, while the destination is. Migrate. + */ + if (!node_isset(src_nid, ng->active_nodes)) + return true; + + /* + * Both source and destination are nodes in active + * use by this numa group. Maximize memory bandwidth + * by migrating from more heavily used groups, to less + * heavily used ones, spreading the load around. + * Use a 1/4 hysteresis to avoid spurious page movement. + */ + return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); +} + +static unsigned long weighted_cpuload(const int cpu); +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static unsigned long capacity_of(int cpu); +static long effective_load(struct task_group *tg, int cpu, long wl, long wg); + +/* Cached statistics for all CPUs within a node */ +struct numa_stats { + unsigned long nr_running; + unsigned long load; + + /* Total compute capacity of CPUs on a node */ + unsigned long compute_capacity; + + /* Approximate capacity in terms of runnable tasks on a node */ + unsigned long task_capacity; + int has_free_capacity; +}; + +/* + * XXX borrowed from update_sg_lb_stats + */ +static void update_numa_stats(struct numa_stats *ns, int nid) +{ + int cpu, cpus = 0; + + memset(ns, 0, sizeof(*ns)); + for_each_cpu(cpu, cpumask_of_node(nid)) { + struct rq *rq = cpu_rq(cpu); + + ns->nr_running += rq->nr_running; + ns->load += weighted_cpuload(cpu); + ns->compute_capacity += capacity_of(cpu); + + cpus++; + } + + /* + * If we raced with hotplug and there are no CPUs left in our mask + * the @ns structure is NULL'ed and task_numa_compare() will + * not find this node attractive. + * + * We'll either bail at !has_free_capacity, or we'll detect a huge + * imbalance and bail there. + */ + if (!cpus) + return; + + ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; + ns->task_capacity = + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); + ns->has_free_capacity = (ns->nr_running < ns->task_capacity); +} + +struct task_numa_env { + struct task_struct *p; + + int src_cpu, src_nid; + int dst_cpu, dst_nid; + + struct numa_stats src_stats, dst_stats; + + int imbalance_pct; + + struct task_struct *best_task; + long best_imp; + int best_cpu; +}; + +static void task_numa_assign(struct task_numa_env *env, + struct task_struct *p, long imp) +{ + if (env->best_task) + put_task_struct(env->best_task); + if (p) + get_task_struct(p); + + env->best_task = p; + env->best_imp = imp; + env->best_cpu = env->dst_cpu; +} + +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, + long src_load, long dst_load, + struct task_numa_env *env) +{ + long imb, old_imb; + + /* We care about the slope of the imbalance, not the direction. */ + if (dst_load < src_load) + swap(dst_load, src_load); + + /* Is the difference below the threshold? */ + imb = dst_load * 100 - src_load * env->imbalance_pct; + if (imb <= 0) + return false; + + /* + * The imbalance is above the allowed threshold. + * Compare it with the old imbalance. + */ + if (orig_dst_load < orig_src_load) + swap(orig_dst_load, orig_src_load); + + old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + + /* Would this change make things worse? */ + return (imb > old_imb); +} + +/* + * This checks if the overall compute and NUMA accesses of the system would + * be improved if the source tasks was migrated to the target dst_cpu taking + * into account that it might be best if task running on the dst_cpu should + * be exchanged with the source task + */ +static void task_numa_compare(struct task_numa_env *env, + long taskimp, long groupimp) +{ + struct rq *src_rq = cpu_rq(env->src_cpu); + struct rq *dst_rq = cpu_rq(env->dst_cpu); + struct task_struct *cur; + long orig_src_load, src_load; + long orig_dst_load, dst_load; + long load; + long imp = (groupimp > 0) ? groupimp : taskimp; + + rcu_read_lock(); + cur = ACCESS_ONCE(dst_rq->curr); + if (cur->pid == 0) /* idle */ + cur = NULL; + + /* + * "imp" is the fault differential for the source task between the + * source and destination node. Calculate the total differential for + * the source task and potential destination task. The more negative + * the value is, the more rmeote accesses that would be expected to + * be incurred if the tasks were swapped. + */ + if (cur) { + /* Skip this swap candidate if cannot move to the source cpu */ + if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) + goto unlock; + + /* + * If dst and source tasks are in the same NUMA group, or not + * in any group then look only at task weights. + */ + if (cur->numa_group == env->p->numa_group) { + imp = taskimp + task_weight(cur, env->src_nid) - + task_weight(cur, env->dst_nid); + /* + * Add some hysteresis to prevent swapping the + * tasks within a group over tiny differences. + */ + if (cur->numa_group) + imp -= imp/16; + } else { + /* + * Compare the group weights. If a task is all by + * itself (not part of a group), use the task weight + * instead. + */ + if (env->p->numa_group) + imp = groupimp; + else + imp = taskimp; + + if (cur->numa_group) + imp += group_weight(cur, env->src_nid) - + group_weight(cur, env->dst_nid); + else + imp += task_weight(cur, env->src_nid) - + task_weight(cur, env->dst_nid); + } + } + + if (imp < env->best_imp) + goto unlock; + + if (!cur) { + /* Is there capacity at our destination? */ + if (env->src_stats.has_free_capacity && + !env->dst_stats.has_free_capacity) + goto unlock; + + goto balance; + } + + /* Balance doesn't matter much if we're running a task per cpu */ + if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) + goto assign; + + /* + * In the overloaded case, try and keep the load balanced. + */ +balance: + orig_dst_load = env->dst_stats.load; + orig_src_load = env->src_stats.load; + + /* XXX missing capacity terms */ + load = task_h_load(env->p); + dst_load = orig_dst_load + load; + src_load = orig_src_load - load; + + if (cur) { + load = task_h_load(cur); + dst_load -= load; + src_load += load; + } + + if (load_too_imbalanced(orig_src_load, orig_dst_load, + src_load, dst_load, env)) + goto unlock; + +assign: + task_numa_assign(env, cur, imp); +unlock: + rcu_read_unlock(); +} + +static void task_numa_find_cpu(struct task_numa_env *env, + long taskimp, long groupimp) +{ + int cpu; + + for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { + /* Skip this CPU if the source task cannot migrate */ + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) + continue; + + env->dst_cpu = cpu; + task_numa_compare(env, taskimp, groupimp); + } +} + +static int task_numa_migrate(struct task_struct *p) +{ + struct task_numa_env env = { + .p = p, + + .src_cpu = task_cpu(p), + .src_nid = task_node(p), + + .imbalance_pct = 112, + + .best_task = NULL, + .best_imp = 0, + .best_cpu = -1 + }; + struct sched_domain *sd; + unsigned long taskweight, groupweight; + int nid, ret; + long taskimp, groupimp; + + /* + * Pick the lowest SD_NUMA domain, as that would have the smallest + * imbalance and would be the first to start moving tasks about. + * + * And we want to avoid any moving of tasks about, as that would create + * random movement of tasks -- counter the numa conditions we're trying + * to satisfy here. + */ + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); + if (sd) + env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + rcu_read_unlock(); + + /* + * Cpusets can break the scheduler domain tree into smaller + * balance domains, some of which do not cross NUMA boundaries. + * Tasks that are "trapped" in such domains cannot be migrated + * elsewhere, so there is no point in (re)trying. + */ + if (unlikely(!sd)) { + p->numa_preferred_nid = task_node(p); + return -EINVAL; + } + + taskweight = task_weight(p, env.src_nid); + groupweight = group_weight(p, env.src_nid); + update_numa_stats(&env.src_stats, env.src_nid); + env.dst_nid = p->numa_preferred_nid; + taskimp = task_weight(p, env.dst_nid) - taskweight; + groupimp = group_weight(p, env.dst_nid) - groupweight; + update_numa_stats(&env.dst_stats, env.dst_nid); + + /* If the preferred nid has free capacity, try to use it. */ + if (env.dst_stats.has_free_capacity) + task_numa_find_cpu(&env, taskimp, groupimp); + + /* No space available on the preferred nid. Look elsewhere. */ + if (env.best_cpu == -1) { + for_each_online_node(nid) { + if (nid == env.src_nid || nid == p->numa_preferred_nid) + continue; + + /* Only consider nodes where both task and groups benefit */ + taskimp = task_weight(p, nid) - taskweight; + groupimp = group_weight(p, nid) - groupweight; + if (taskimp < 0 && groupimp < 0) + continue; + + env.dst_nid = nid; + update_numa_stats(&env.dst_stats, env.dst_nid); + task_numa_find_cpu(&env, taskimp, groupimp); + } + } + + /* No better CPU than the current one was found. */ + if (env.best_cpu == -1) + return -EAGAIN; + + /* + * If the task is part of a workload that spans multiple NUMA nodes, + * and is migrating into one of the workload's active nodes, remember + * this node as the task's preferred numa node, so the workload can + * settle down. + * A task that migrated to a second choice node will be better off + * trying for a better one later. Do not set the preferred node here. + */ + if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); + + /* + * Reset the scan period if the task is being rescheduled on an + * alternative node to recheck if the tasks is now properly placed. + */ + p->numa_scan_period = task_scan_min(p); + + if (env.best_task == NULL) { + ret = migrate_task_to(p, env.best_cpu); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); + return ret; + } + + ret = migrate_swap(p, env.best_task); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); + put_task_struct(env.best_task); + return ret; +} + +/* Attempt to migrate a task to a CPU on the preferred node. */ +static void numa_migrate_preferred(struct task_struct *p) { - int seq; + unsigned long interval = HZ; - if (!p->mm) /* for example, ksmd faulting in a user's mm */ + /* This task has no NUMA fault statistics yet */ + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) return; + + /* Periodically retry migrating the task to the preferred node */ + interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); + p->numa_migrate_retry = jiffies + interval; + + /* Success if task is already running on preferred CPU */ + if (task_node(p) == p->numa_preferred_nid) + return; + + /* Otherwise, try migrate to a CPU on the preferred node */ + task_numa_migrate(p); +} + +/* + * Find the nodes on which the workload is actively running. We do this by + * tracking the nodes from which NUMA hinting faults are triggered. This can + * be different from the set of nodes where the workload's memory is currently + * located. + * + * The bitmask is used to make smarter decisions on when to do NUMA page + * migrations, To prevent flip-flopping, and excessive page migrations, nodes + * are added when they cause over 6/16 of the maximum number of faults, but + * only removed when they drop below 3/16. + */ +static void update_numa_active_node_mask(struct numa_group *numa_group) +{ + unsigned long faults, max_faults = 0; + int nid; + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (faults > max_faults) + max_faults = faults; + } + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (!node_isset(nid, numa_group->active_nodes)) { + if (faults > max_faults * 6 / 16) + node_set(nid, numa_group->active_nodes); + } else if (faults < max_faults * 3 / 16) + node_clear(nid, numa_group->active_nodes); + } +} + +/* + * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS + * increments. The more local the fault statistics are, the higher the scan + * period will be for the next scan window. If local/remote ratio is below + * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the + * scan period will decrease + */ +#define NUMA_PERIOD_SLOTS 10 +#define NUMA_PERIOD_THRESHOLD 3 + +/* + * Increase the scan period (slow down scanning) if the majority of + * our memory is already on our local node, or if the majority of + * the page accesses are shared with other processes. + * Otherwise, decrease the scan period. + */ +static void update_task_scan_period(struct task_struct *p, + unsigned long shared, unsigned long private) +{ + unsigned int period_slot; + int ratio; + int diff; + + unsigned long remote = p->numa_faults_locality[0]; + unsigned long local = p->numa_faults_locality[1]; + + /* + * If there were no record hinting faults then either the task is + * completely idle or all activity is areas that are not of interest + * to automatic numa balancing. Scan slower + */ + if (local + shared == 0) { + p->numa_scan_period = min(p->numa_scan_period_max, + p->numa_scan_period << 1); + + p->mm->numa_next_scan = jiffies + + msecs_to_jiffies(p->numa_scan_period); + + return; + } + + /* + * Prepare to scale scan period relative to the current period. + * == NUMA_PERIOD_THRESHOLD scan period stays the same + * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) + * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) + */ + period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); + ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); + if (ratio >= NUMA_PERIOD_THRESHOLD) { + int slot = ratio - NUMA_PERIOD_THRESHOLD; + if (!slot) + slot = 1; + diff = slot * period_slot; + } else { + diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; + + /* + * Scale scan rate increases based on sharing. There is an + * inverse relationship between the degree of sharing and + * the adjustment made to the scanning period. Broadly + * speaking the intent is that there is little point + * scanning faster if shared accesses dominate as it may + * simply bounce migrations uselessly + */ + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + diff = (diff * ratio) / NUMA_PERIOD_SLOTS; + } + + p->numa_scan_period = clamp(p->numa_scan_period + diff, + task_scan_min(p), task_scan_max(p)); + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); +} + +/* + * Get the fraction of time the task has been running since the last + * NUMA placement cycle. The scheduler keeps similar statistics, but + * decays those on a 32ms period, which is orders of magnitude off + * from the dozens-of-seconds NUMA balancing period. Use the scheduler + * stats only if the task is so new there are no NUMA statistics yet. + */ +static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) +{ + u64 runtime, delta, now; + /* Use the start of this time slice to avoid calculations. */ + now = p->se.exec_start; + runtime = p->se.sum_exec_runtime; + + if (p->last_task_numa_placement) { + delta = runtime - p->last_sum_exec_runtime; + *period = now - p->last_task_numa_placement; + } else { + delta = p->se.avg.runnable_avg_sum; + *period = p->se.avg.runnable_avg_period; + } + + p->last_sum_exec_runtime = runtime; + p->last_task_numa_placement = now; + + return delta; +} + +static void task_numa_placement(struct task_struct *p) +{ + int seq, nid, max_nid = -1, max_group_nid = -1; + unsigned long max_faults = 0, max_group_faults = 0; + unsigned long fault_types[2] = { 0, 0 }; + unsigned long total_faults; + u64 runtime, period; + spinlock_t *group_lock = NULL; + seq = ACCESS_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; + p->numa_scan_period_max = task_scan_max(p); + + total_faults = p->numa_faults_locality[0] + + p->numa_faults_locality[1]; + runtime = numa_get_avg_runtime(p, &period); + + /* If the task is part of a group prevent parallel updates to group stats */ + if (p->numa_group) { + group_lock = &p->numa_group->lock; + spin_lock_irq(group_lock); + } + + /* Find the node with the highest number of faults */ + for_each_online_node(nid) { + unsigned long faults = 0, group_faults = 0; + int priv, i; + + for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { + long diff, f_diff, f_weight; - /* FIXME: Scheduling placement policy hints go here */ + i = task_faults_idx(nid, priv); + + /* Decay existing window, copy faults since last scan */ + diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; + fault_types[priv] += p->numa_faults_buffer_memory[i]; + p->numa_faults_buffer_memory[i] = 0; + + /* + * Normalize the faults_from, so all tasks in a group + * count according to CPU use, instead of by the raw + * number of faults. Tasks with little runtime have + * little over-all impact on throughput, and thus their + * faults are less important. + */ + f_weight = div64_u64(runtime << 16, period + 1); + f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + (total_faults + 1); + f_diff = f_weight - p->numa_faults_cpu[i] / 2; + p->numa_faults_buffer_cpu[i] = 0; + + p->numa_faults_memory[i] += diff; + p->numa_faults_cpu[i] += f_diff; + faults += p->numa_faults_memory[i]; + p->total_numa_faults += diff; + if (p->numa_group) { + /* safe because we can only change our own group */ + p->numa_group->faults[i] += diff; + p->numa_group->faults_cpu[i] += f_diff; + p->numa_group->total_faults += diff; + group_faults += p->numa_group->faults[i]; + } + } + + if (faults > max_faults) { + max_faults = faults; + max_nid = nid; + } + + if (group_faults > max_group_faults) { + max_group_faults = group_faults; + max_group_nid = nid; + } + } + + update_task_scan_period(p, fault_types[0], fault_types[1]); + + if (p->numa_group) { + update_numa_active_node_mask(p->numa_group); + /* + * If the preferred task and group nids are different, + * iterate over the nodes again to find the best place. + */ + if (max_nid != max_group_nid) { + unsigned long weight, max_weight = 0; + + for_each_online_node(nid) { + weight = task_weight(p, nid) + group_weight(p, nid); + if (weight > max_weight) { + max_weight = weight; + max_nid = nid; + } + } + } + + spin_unlock_irq(group_lock); + } + + /* Preferred node as the node with the most faults */ + if (max_faults && max_nid != p->numa_preferred_nid) { + /* Update the preferred nid and migrate task if possible */ + sched_setnuma(p, max_nid); + numa_migrate_preferred(p); + } +} + +static inline int get_numa_group(struct numa_group *grp) +{ + return atomic_inc_not_zero(&grp->refcount); +} + +static inline void put_numa_group(struct numa_group *grp) +{ + if (atomic_dec_and_test(&grp->refcount)) + kfree_rcu(grp, rcu); +} + +static void task_numa_group(struct task_struct *p, int cpupid, int flags, + int *priv) +{ + struct numa_group *grp, *my_grp; + struct task_struct *tsk; + bool join = false; + int cpu = cpupid_to_cpu(cpupid); + int i; + + if (unlikely(!p->numa_group)) { + unsigned int size = sizeof(struct numa_group) + + 4*nr_node_ids*sizeof(unsigned long); + + grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!grp) + return; + + atomic_set(&grp->refcount, 1); + spin_lock_init(&grp->lock); + INIT_LIST_HEAD(&grp->task_list); + grp->gid = p->pid; + /* Second half of the array tracks nids where faults happen */ + grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * + nr_node_ids; + + node_set(task_node(current), grp->active_nodes); + + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + grp->faults[i] = p->numa_faults_memory[i]; + + grp->total_faults = p->total_numa_faults; + + list_add(&p->numa_entry, &grp->task_list); + grp->nr_tasks++; + rcu_assign_pointer(p->numa_group, grp); + } + + rcu_read_lock(); + tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); + + if (!cpupid_match_pid(tsk, cpupid)) + goto no_join; + + grp = rcu_dereference(tsk->numa_group); + if (!grp) + goto no_join; + + my_grp = p->numa_group; + if (grp == my_grp) + goto no_join; + + /* + * Only join the other group if its bigger; if we're the bigger group, + * the other task will join us. + */ + if (my_grp->nr_tasks > grp->nr_tasks) + goto no_join; + + /* + * Tie-break on the grp address. + */ + if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) + goto no_join; + + /* Always join threads in the same process. */ + if (tsk->mm == current->mm) + join = true; + + /* Simple filter to avoid false positives due to PID collisions */ + if (flags & TNF_SHARED) + join = true; + + /* Update priv based on whether false sharing was detected */ + *priv = !join; + + if (join && !get_numa_group(grp)) + goto no_join; + + rcu_read_unlock(); + + if (!join) + return; + + BUG_ON(irqs_disabled()); + double_lock_irq(&my_grp->lock, &grp->lock); + + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { + my_grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] += p->numa_faults_memory[i]; + } + my_grp->total_faults -= p->total_numa_faults; + grp->total_faults += p->total_numa_faults; + + list_move(&p->numa_entry, &grp->task_list); + my_grp->nr_tasks--; + grp->nr_tasks++; + + spin_unlock(&my_grp->lock); + spin_unlock_irq(&grp->lock); + + rcu_assign_pointer(p->numa_group, grp); + + put_numa_group(my_grp); + return; + +no_join: + rcu_read_unlock(); + return; +} + +void task_numa_free(struct task_struct *p) +{ + struct numa_group *grp = p->numa_group; + void *numa_faults = p->numa_faults_memory; + unsigned long flags; + int i; + + if (grp) { + spin_lock_irqsave(&grp->lock, flags); + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + grp->faults[i] -= p->numa_faults_memory[i]; + grp->total_faults -= p->total_numa_faults; + + list_del(&p->numa_entry); + grp->nr_tasks--; + spin_unlock_irqrestore(&grp->lock, flags); + rcu_assign_pointer(p->numa_group, NULL); + put_numa_group(grp); + } + + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; + p->numa_faults_cpu= NULL; + p->numa_faults_buffer_cpu = NULL; + kfree(numa_faults); } /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int node, int pages, bool migrated) +void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) { struct task_struct *p = current; + bool migrated = flags & TNF_MIGRATED; + int cpu_node = task_node(current); + int local = !!(flags & TNF_FAULT_LOCAL); + int priv; if (!numabalancing_enabled) return; - /* FIXME: Allocate task-specific structure for placement policy here */ + /* for example, ksmd faulting in a user's mm */ + if (!p->mm) + return; + + /* Do not worry about placement if exiting */ + if (p->state == TASK_DEAD) + return; + + /* Allocate buffer to track faults on a per-node basis */ + if (unlikely(!p->numa_faults_memory)) { + int size = sizeof(*p->numa_faults_memory) * + NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; + + p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults_memory) + return; + + BUG_ON(p->numa_faults_buffer_memory); + /* + * The averaged statistics, shared & private, memory & cpu, + * occupy the first half of the array. The second half of the + * array is for current counters, which are averaged into the + * first set by task_numa_placement. + */ + p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); + p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); + p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); + p->total_numa_faults = 0; + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); + } /* - * If pages are properly placed (did not migrate) then scan slower. - * This is reset periodically in case of phase changes + * First accesses are treated as private, otherwise consider accesses + * to be private if the accessing pid has not changed */ - if (!migrated) - p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, - p->numa_scan_period + jiffies_to_msecs(10)); + if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { + priv = 1; + } else { + priv = cpupid_match_pid(p, last_cpupid); + if (!priv && !(flags & TNF_NO_GROUP)) + task_numa_group(p, last_cpupid, flags, &priv); + } + + /* + * If a workload spans multiple NUMA nodes, a shared fault that + * occurs wholly within the set of nodes that the workload is + * actively using should be counted as local. This allows the + * scan rate to slow down when a workload has settled down. + */ + if (!priv && !local && p->numa_group && + node_isset(cpu_node, p->numa_group->active_nodes) && + node_isset(mem_node, p->numa_group->active_nodes)) + local = 1; task_numa_placement(p); + + /* + * Retry task to preferred node migration periodically, in case it + * case it previously failed, or the scheduler moved us. + */ + if (time_after(jiffies, p->numa_migrate_retry)) + numa_migrate_preferred(p); + + if (migrated) + p->numa_pages_migrated += pages; + + p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; + p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; + p->numa_faults_locality[local] += pages; } static void reset_ptenuma_scan(struct task_struct *p) @@ -884,6 +1872,7 @@ void task_numa_work(struct callback_head *work) struct mm_struct *mm = p->mm; struct vm_area_struct *vma; unsigned long start, end; + unsigned long nr_pte_updates = 0; long pages; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -900,35 +1889,9 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; - /* - * We do not care about task placement until a task runs on a node - * other than the first one used by the address space. This is - * largely because migrations are driven by what CPU the task - * is running on. If it's never scheduled on another node, it'll - * not migrate so why bother trapping the fault. - */ - if (mm->first_nid == NUMA_PTE_SCAN_INIT) - mm->first_nid = numa_node_id(); - if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { - /* Are we running on a new node yet? */ - if (numa_node_id() == mm->first_nid && - !sched_feat_numa(NUMA_FORCE)) - return; - - mm->first_nid = NUMA_PTE_SCAN_ACTIVE; - } - - /* - * Reset the scan period if enough time has gone by. Objective is that - * scanning will be reduced if pages are properly placed. As tasks - * can enter different phases this needs to be re-examined. Lacking - * proper tracking of reference behaviour, this blunt hammer is used. - */ - migrate = mm->numa_next_reset; - if (time_after(now, migrate)) { - p->numa_scan_period = sysctl_numa_balancing_scan_period_min; - next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); - xchg(&mm->numa_next_reset, next_scan); + if (!mm->numa_next_scan) { + mm->numa_next_scan = now + + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); } /* @@ -938,20 +1901,20 @@ void task_numa_work(struct callback_head *work) if (time_before(now, migrate)) return; - if (p->numa_scan_period == 0) - p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + if (p->numa_scan_period == 0) { + p->numa_scan_period_max = task_scan_max(p); + p->numa_scan_period = task_scan_min(p); + } next_scan = now + msecs_to_jiffies(p->numa_scan_period); if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) return; /* - * Do not set pte_numa if the current running node is rate-limited. - * This loses statistics on the fault but if we are unwilling to - * migrate to this node, it is less likely we can do useful work + * Delay this task enough that another task of this mm will likely win + * the next time around. */ - if (migrate_ratelimited(numa_node_id())) - return; + p->node_stamp += 2 * TICK_NSEC; start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; @@ -967,31 +1930,54 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma)) + if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) continue; - /* Skip small VMAs. They are not likely to be of relevance */ - if (vma->vm_end - vma->vm_start < HPAGE_SIZE) + /* + * Shared library pages mapped by multiple processes are not + * migrated as it is expected they are cache replicated. Avoid + * hinting faults in read-only file-backed mappings or the vdso + * as migrating the pages will be of marginal benefit. + */ + if (!vma->vm_mm || + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + continue; + + /* + * Skip inaccessible VMAs to avoid any confusion between + * PROT_NONE and NUMA hinting ptes + */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) continue; do { start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); end = min(end, vma->vm_end); - pages -= change_prot_numa(vma, start, end); + nr_pte_updates += change_prot_numa(vma, start, end); + + /* + * Scan sysctl_numa_balancing_scan_size but ensure that + * at least one PTE is updated so that unused virtual + * address space is quickly skipped. + */ + if (nr_pte_updates) + pages -= (end - start) >> PAGE_SHIFT; start = end; if (pages <= 0) goto out; + + cond_resched(); } while (end != vma->vm_end); } out: /* - * It is possible to reach the end of the VMA list but the last few VMAs are - * not guaranteed to the vma_migratable. If they are not, we would find the - * !migratable VMA on the next scan but not reset the scanner to the start - * so check it now. + * It is possible to reach the end of the VMA list but the last few + * VMAs are not guaranteed to the vma_migratable. If they are not, we + * would find the !migratable VMA on the next scan but not reset the + * scanner to the start so check it now. */ if (vma) mm->numa_scan_offset = start; @@ -1025,8 +2011,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) if (now - curr->node_stamp > period) { if (!curr->node_stamp) - curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; - curr->node_stamp = now; + curr->numa_scan_period = task_scan_min(curr); + curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) { init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ @@ -1038,6 +2024,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) static void task_tick_numa(struct rq *rq, struct task_struct *curr) { } + +static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) +{ +} + +static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) +{ +} #endif /* CONFIG_NUMA_BALANCING */ static void @@ -1047,8 +2041,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) if (!parent_entity(se)) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP - if (entity_is_task(se)) - list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); + if (entity_is_task(se)) { + struct rq *rq = rq_of(cfs_rq); + + account_numa_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); + } #endif cfs_rq->nr_running++; } @@ -1059,8 +2057,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); + } cfs_rq->nr_running--; } @@ -1378,7 +2378,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, long contrib; /* The fraction of a cpu used by this cfs_rq */ - contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, + contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, sa->runnable_avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; @@ -1429,13 +2429,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) se->avg.load_avg_contrib >>= NICE_0_SHIFT; } } -#else + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); +} +#else /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} static inline void __update_tg_runnable_avg(struct sched_avg *sa, struct cfs_rq *cfs_rq) {} static inline void __update_group_entity_contrib(struct sched_entity *se) {} -#endif +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +#endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_task_entity_contrib(struct sched_entity *se) { @@ -1533,12 +2540,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); } -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) -{ - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); - __update_tg_runnable_avg(&rq->avg, &rq->cfs); -} - /* Add the load generated by se into cfs_rq's child load-average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, @@ -1572,13 +2573,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, } wakeup = 0; } else { - /* - * Task re-woke on same cpu (or else migrate_task_rq_fair() - * would have made count negative); we must be careful to avoid - * double-accounting blocked time after synchronizing decays. - */ - se->avg.last_runnable_update += __synchronize_entity_decay(se) - << 20; + __synchronize_entity_decay(se); } /* migrated tasks did not contribute to our blocked load */ @@ -1632,7 +2627,10 @@ void idle_exit_fair(struct rq *this_rq) update_rq_runnable_avg(this_rq, 0); } -#else +static int idle_balance(struct rq *this_rq); + +#else /* CONFIG_SMP */ + static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} @@ -1644,7 +2642,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, int sleep) {} static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) {} -#endif + +static inline int idle_balance(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -1794,10 +2798,10 @@ static void __clear_buddies_last(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last == se) - cfs_rq->last = NULL; - else + if (cfs_rq->last != se) break; + + cfs_rq->last = NULL; } } @@ -1805,10 +2809,10 @@ static void __clear_buddies_next(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->next == se) - cfs_rq->next = NULL; - else + if (cfs_rq->next != se) break; + + cfs_rq->next = NULL; } } @@ -1816,10 +2820,10 @@ static void __clear_buddies_skip(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip == se) - cfs_rq->skip = NULL; - else + if (cfs_rq->skip != se) break; + + cfs_rq->skip = NULL; } } @@ -1962,17 +2966,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); * 3) pick the "last" process, for cache locality * 4) do not run the "skip" process, if something else is available */ -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *se = __pick_first_entity(cfs_rq); - struct sched_entity *left = se; + struct sched_entity *left = __pick_first_entity(cfs_rq); + struct sched_entity *se; + + /* + * If curr is set we have to see if its left of the leftmost entity + * still in the tree, provided there was anything in the tree at all. + */ + if (!left || (curr && entity_before(curr, left))) + left = curr; + + se = left; /* ideally we run the leftmost entity */ /* * Avoid running the skip buddy, if running something else can * be done without getting too unfair. */ if (cfs_rq->skip == se) { - struct sched_entity *second = __pick_next_entity(se); + struct sched_entity *second; + + if (se == curr) { + second = __pick_first_entity(cfs_rq); + } else { + second = __pick_next_entity(se); + if (!second || (curr && entity_before(curr, second))) + second = curr; + } + if (second && wakeup_preempt_entity(second, left) < 1) se = second; } @@ -1994,7 +3017,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) return se; } -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { @@ -2070,13 +3093,14 @@ static inline bool cfs_bandwidth_used(void) return static_key_false(&__cfs_bandwidth_used); } -void account_cfs_bandwidth_used(int enabled, int was_enabled) +void cfs_bandwidth_usage_inc(void) +{ + static_key_slow_inc(&__cfs_bandwidth_used); +} + +void cfs_bandwidth_usage_dec(void) { - /* only need to count groups transitioning between enabled/!enabled */ - if (enabled && !was_enabled) - static_key_slow_inc(&__cfs_bandwidth_used); - else if (!enabled && was_enabled) - static_key_slow_dec(&__cfs_bandwidth_used); + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) @@ -2084,7 +3108,8 @@ static bool cfs_bandwidth_used(void) return true; } -void account_cfs_bandwidth_used(int enabled, int was_enabled) {} +void cfs_bandwidth_usage_inc(void) {} +void cfs_bandwidth_usage_dec(void) {} #endif /* HAVE_JUMP_LABEL */ /* @@ -2156,7 +3181,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) */ if (!cfs_b->timer_active) { __refill_cfs_bandwidth_runtime(cfs_b); - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); } if (cfs_b->runtime > 0) { @@ -2201,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. + * whether the global deadline has advanced. It is valid to compare + * cfs_b->runtime_expires without any locks since we only care about + * exact equality, so a partial write will still work. */ - if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { + if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { @@ -2213,8 +3240,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) } } -static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) +static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; @@ -2232,7 +3258,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, } static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; @@ -2329,12 +3355,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running -= task_delta; + sub_nr_running(rq, task_delta); cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + if (!cfs_b->timer_active) + __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); } @@ -2378,7 +3406,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running += task_delta; + add_nr_running(rq, task_delta); /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -2432,28 +3460,35 @@ next: static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { u64 runtime, runtime_expires; - int idle = 1, throttled; + int throttled; - raw_spin_lock(&cfs_b->lock); /* no need to continue the timer with no bandwidth constraint */ if (cfs_b->quota == RUNTIME_INF) - goto out_unlock; + goto out_deactivate; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - /* idle depends on !throttled (for the case of a large deficit) */ - idle = cfs_b->idle && !throttled; cfs_b->nr_periods += overrun; - /* if we're going inactive then everything else can be deferred */ - if (idle) - goto out_unlock; + /* + * idle depends on !throttled (for the case of a large deficit), and if + * we're going inactive then everything else can be deferred + */ + if (cfs_b->idle && !throttled) + goto out_deactivate; + + /* + * if we have relooped after returning idle once, we need to update our + * status as actually running, so that other cpus doing + * __start_cfs_bandwidth will stop trying to cancel us. + */ + cfs_b->timer_active = 1; __refill_cfs_bandwidth_runtime(cfs_b); if (!throttled) { /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; - goto out_unlock; + return 0; } /* account preceding periods in which throttling occurred */ @@ -2493,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) * timer to remain active while there are any throttled entities.) */ cfs_b->idle = 0; -out_unlock: - if (idle) - cfs_b->timer_active = 0; - raw_spin_unlock(&cfs_b->lock); - return idle; + return 0; + +out_deactivate: + cfs_b->timer_active = 0; + return 1; } /* a cfs_rq won't donate quota below this amount */ @@ -2508,7 +3543,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; /* how long we wait to gather additional slack before distributing */ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; -/* are we near the end of the current quota period? */ +/* + * Are we near the end of the current quota period? + * + * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the + * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of + * migrate_hrtimers, base is never cleared, so we are fine. + */ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) { struct hrtimer *refresh_timer = &cfs_b->period_timer; @@ -2584,10 +3625,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) u64 expires; /* confirm we're still not at a refresh boundary */ - if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) + raw_spin_lock(&cfs_b->lock); + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { + raw_spin_unlock(&cfs_b->lock); return; + } - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { runtime = cfs_b->runtime; cfs_b->runtime = 0; @@ -2631,22 +3674,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) } /* conditionally throttle active cfs_rq's from put_prev_entity() */ -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { if (!cfs_bandwidth_used()) - return; + return false; if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) - return; + return false; /* * it's possible for a throttled entity to be forced into a running * state (e.g. set_curr_task), in this case we're finished. */ if (cfs_rq_throttled(cfs_rq)) - return; + return true; throttle_cfs_rq(cfs_rq); + return true; } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) @@ -2666,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) int overrun; int idle = 0; + raw_spin_lock(&cfs_b->lock); for (;;) { now = hrtimer_cb_get_time(timer); overrun = hrtimer_forward(timer, now, cfs_b->period); @@ -2675,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) idle = do_sched_cfs_period_timer(cfs_b, overrun); } + raw_spin_unlock(&cfs_b->lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -2700,7 +3746,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) } /* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) { /* * The timer may be active because we're trying to set a new bandwidth @@ -2708,14 +3754,14 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * (timer_active==0 becomes visible before the hrtimer call-back * terminates). In either case we ensure that it's re-programmed */ - while (unlikely(hrtimer_active(&cfs_b->period_timer))) { + while (unlikely(hrtimer_active(&cfs_b->period_timer)) && + hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { + /* bounce the lock to allow do_sched_cfs_period_timer to run */ raw_spin_unlock(&cfs_b->lock); - /* ensure cfs_b->lock is available while we wait */ - hrtimer_cancel(&cfs_b->period_timer); - + cpu_relax(); raw_spin_lock(&cfs_b->lock); /* if someone else restarted the timer then we're done */ - if (cfs_b->timer_active) + if (!force && cfs_b->timer_active) return; } @@ -2734,8 +3780,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) struct cfs_rq *cfs_rq; for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) continue; @@ -2743,7 +3787,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) * clock_task is not advancing so we just need to make sure * there's some valid quota amount */ - cfs_rq->runtime_remaining = cfs_b->quota; + cfs_rq->runtime_remaining = 1; if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } @@ -2755,9 +3799,8 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) return rq_clock_task(rq_of(cfs_rq)); } -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) {} -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -2895,7 +3938,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { update_rq_runnable_avg(rq, rq->nr_running); - inc_nr_running(rq); + add_nr_running(rq, 1); } hrtick_update(rq); } @@ -2955,7 +3998,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } if (!se) { - dec_nr_running(rq); + sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); } hrtick_update(rq); @@ -3001,9 +4044,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long power_of(int cpu) +static unsigned long capacity_of(int cpu) { - return cpu_rq(cpu)->cpu_power; + return cpu_rq(cpu)->cpu_capacity; } static unsigned long cpu_avg_load_per_task(int cpu) @@ -3025,8 +4068,8 @@ static void record_wakee(struct task_struct *p) * about the boundary, really active task won't care * about the loss. */ - if (jiffies > current->wakee_flip_decay_ts + HZ) { - current->wakee_flips = 0; + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { + current->wakee_flips >>= 1; current->wakee_flip_decay_ts = jiffies; } @@ -3166,8 +4209,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) } #else -static inline unsigned long effective_load(struct task_group *tg, int cpu, - unsigned long wl, unsigned long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { return wl; } @@ -3247,12 +4289,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) s64 this_eff_load, prev_eff_load; this_eff_load = 100; - this_eff_load *= power_of(prev_cpu); + this_eff_load *= capacity_of(prev_cpu); this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= power_of(this_cpu); + prev_eff_load *= capacity_of(this_cpu); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); balanced = this_eff_load <= prev_eff_load; @@ -3292,12 +4334,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int load_idx) + int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + if (sd_flag & SD_BALANCE_WAKE) + load_idx = sd->wake_idx; + do { unsigned long load, avg_load; int local_group; @@ -3324,8 +4370,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += load; } - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; + /* Adjust by relative CPU capacity of the group */ + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; if (local_group) { this_load = avg_load; @@ -3409,22 +4455,22 @@ done: } /* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, + * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * - * Balance, ie. select the least loaded group. + * Balances load by selecting the idlest cpu in the idlest group, or under + * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. * - * Returns the target CPU number, or the same CPU if no balancing is needed. + * Returns the target cpu number. * * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); - int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; @@ -3457,16 +4503,15 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) sd = tmp; } - if (affine_sd) { - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; + if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + prev_cpu = cpu; + if (sd_flag & SD_BALANCE_WAKE) { new_cpu = select_idle_sibling(p, prev_cpu); goto unlock; } while (sd) { - int load_idx = sd->forkexec_idx; struct sched_group *group; int weight; @@ -3475,10 +4520,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) continue; } - if (sd_flag & SD_BALANCE_WAKE) - load_idx = sd->wake_idx; - - group = find_idlest_group(sd, p, cpu, load_idx); + group = find_idlest_group(sd, p, cpu, sd_flag); if (!group) { sd = sd->child; continue; @@ -3532,6 +4574,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) atomic_long_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } + + /* We have migrated, no longer consider this task hot */ + se->exec_start = 0; } #endif /* CONFIG_SMP */ @@ -3695,26 +4740,124 @@ preempt: set_last_buddy(se); } -static struct task_struct *pick_next_task_fair(struct rq *rq) +static struct task_struct * +pick_next_task_fair(struct rq *rq, struct task_struct *prev) { - struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; + struct task_struct *p; + int new_tasks; +again: +#ifdef CONFIG_FAIR_GROUP_SCHED if (!cfs_rq->nr_running) - return NULL; + goto idle; + + if (prev->sched_class != &fair_sched_class) + goto simple; + + /* + * Because of the set_next_buddy() in dequeue_task_fair() it is rather + * likely that a next task is from the same cgroup as the current. + * + * Therefore attempt to avoid putting and setting the entire cgroup + * hierarchy, only change the part that actually changes. + */ do { - se = pick_next_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + + /* + * Since we got here without doing put_prev_entity() we also + * have to consider cfs_rq->curr. If it is still a runnable + * entity, update_curr() will update its vruntime, otherwise + * forget we've ever seen it. + */ + if (curr && curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; + + /* + * This call to check_cfs_rq_runtime() will do the throttle and + * dequeue its entity in the parent(s). Therefore the 'simple' + * nr_running test will indeed be correct. + */ + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto simple; + + se = pick_next_entity(cfs_rq, curr); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + p = task_of(se); + + /* + * Since we haven't yet done put_prev_entity and if the selected task + * is a different task than we started out with, try and touch the + * least amount of cfs_rqs. + */ + if (prev != p) { + struct sched_entity *pse = &prev->se; + + while (!(cfs_rq = is_same_group(se, pse))) { + int se_depth = se->depth; + int pse_depth = pse->depth; + + if (se_depth <= pse_depth) { + put_prev_entity(cfs_rq_of(pse), pse); + pse = parent_entity(pse); + } + if (se_depth >= pse_depth) { + set_next_entity(cfs_rq_of(se), se); + se = parent_entity(se); + } + } + + put_prev_entity(cfs_rq, pse); + set_next_entity(cfs_rq, se); + } + + if (hrtick_enabled(rq)) + hrtick_start_fair(rq, p); + + return p; +simple: + cfs_rq = &rq->cfs; +#endif + + if (!cfs_rq->nr_running) + goto idle; + + put_prev_task(rq, prev); + + do { + se = pick_next_entity(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); p = task_of(se); + if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); return p; + +idle: + new_tasks = idle_balance(rq); + /* + * Because idle_balance() releases (and re-acquires) rq->lock, it is + * possible for any higher priority task to appear. In that case we + * must re-start the pick_next_entity() loop. + */ + if (new_tasks < 0) + return RETRY_TASK; + + if (new_tasks > 0) + goto again; + + return NULL; } /* @@ -3808,14 +4951,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) * - * P_i is the cpu power (or compute capacity) of cpu i, typically it is the + * C_i is the compute capacity of cpu i, typically it is the * fraction of 'recent' time available for SCHED_OTHER task execution. But it * can also include other factors [XXX]. * * To achieve this balance we define a measure of imbalance which follows * directly from (1): * - * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) + * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4) * * We them move tasks around to minimize the imbalance. In the continuous * function space it is obvious this converges, in the discrete case we get @@ -3904,9 +5047,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp static unsigned long __read_mostly max_load_balance_interval = HZ/10; +enum fbq_type { regular, remote, all }; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 -#define LBF_SOME_PINNED 0x04 +#define LBF_DST_PINNED 0x04 +#define LBF_SOME_PINNED 0x08 struct lb_env { struct sched_domain *sd; @@ -3929,6 +5075,8 @@ struct lb_env { unsigned int loop; unsigned int loop_break; unsigned int loop_max; + + enum fbq_type fbq_type; }; /* @@ -3947,7 +5095,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) * Is this task likely cache-hot: */ static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +task_hot(struct task_struct *p, u64 now) { s64 delta; @@ -3975,6 +5123,94 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } +#ifdef CONFIG_NUMA_BALANCING +/* Returns true if the destination node has incurred more faults */ +static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) +{ + struct numa_group *numa_group = rcu_dereference(p->numa_group); + int src_nid, dst_nid; + + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || + !(env->sd->flags & SD_NUMA)) { + return false; + } + + src_nid = cpu_to_node(env->src_cpu); + dst_nid = cpu_to_node(env->dst_cpu); + + if (src_nid == dst_nid) + return false; + + if (numa_group) { + /* Task is already in the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return false; + + /* Task is moving into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) > group_faults(p, src_nid); + } + + /* Encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) + return true; + + return task_faults(p, dst_nid) > task_faults(p, src_nid); +} + + +static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) +{ + struct numa_group *numa_group = rcu_dereference(p->numa_group); + int src_nid, dst_nid; + + if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) + return false; + + if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) + return false; + + src_nid = cpu_to_node(env->src_cpu); + dst_nid = cpu_to_node(env->dst_cpu); + + if (src_nid == dst_nid) + return false; + + if (numa_group) { + /* Task is moving within/into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return false; + + /* Task is moving out of the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) < group_faults(p, src_nid); + } + + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) + return true; + + return task_faults(p, dst_nid) < task_faults(p, src_nid); +} + +#else +static inline bool migrate_improves_locality(struct task_struct *p, + struct lb_env *env) +{ + return false; +} + +static inline bool migrate_degrades_locality(struct task_struct *p, + struct lb_env *env) +{ + return false; +} +#endif + /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ @@ -3997,6 +5233,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + env->flags |= LBF_SOME_PINNED; + /* * Remember if this task can be migrated to any other cpu in * our sched_group. We may want to revisit it if we couldn't @@ -4005,13 +5243,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * Also avoid computing new_dst_cpu if we have already computed * one in current iteration. */ - if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) + if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) return 0; /* Prevent to re-select dst_cpu via env's cpus */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { - env->flags |= LBF_SOME_PINNED; + env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; } @@ -4030,11 +5268,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * Aggressive migration if: - * 1) task is cache cold, or - * 2) too many balance attempts have failed. + * 1) destination numa is preferred + * 2) task is cache cold, or + * 3) too many balance attempts have failed. */ + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); + if (!tsk_cache_hot) + tsk_cache_hot = migrate_degrades_locality(p, env); + + if (migrate_improves_locality(p, env)) { +#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { + schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } +#endif + return 1; + } - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -4077,8 +5328,6 @@ static int move_one_task(struct lb_env *env) return 0; } -static unsigned long task_h_load(struct task_struct *p); - static const unsigned int sched_nr_migrate_break = 32; /* @@ -4242,7 +5491,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } if (!se) { - cfs_rq->h_load = rq->avg.load_avg_contrib; + cfs_rq->h_load = cfs_rq->runnable_load_avg; cfs_rq->last_h_load_update = now; } @@ -4284,13 +5533,17 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; - unsigned long group_power; + unsigned long group_capacity; unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity; + unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ - int group_has_capacity; /* Is there extra capacity in the group? */ + int group_has_free_capacity; +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; +#endif }; /* @@ -4301,7 +5554,7 @@ struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ @@ -4320,7 +5573,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .busiest = NULL, .local = NULL, .total_load = 0UL, - .total_pwr = 0UL, + .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, }, @@ -4330,7 +5583,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + * @idle: The idle status of the CPU for whose sd load_idx is obtained. * * Return: The load index. */ @@ -4355,17 +5608,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) { - return SCHED_POWER_SCALE; + return SCHED_CAPACITY_SCALE; } -unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) { - return default_scale_freq_power(sd, cpu); + return default_scale_capacity(sd, cpu); } -static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; @@ -4375,15 +5628,16 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) return smt_gain; } -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) { - return default_scale_smt_power(sd, cpu); + return default_scale_smt_capacity(sd, cpu); } -static unsigned long scale_rt_power(int cpu) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; + s64 delta; /* * Since we're reading these variables without serialization make sure @@ -4392,74 +5646,78 @@ static unsigned long scale_rt_power(int cpu) age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); - total = sched_avg_period() + (rq_clock(rq) - age_stamp); + delta = rq_clock(rq) - age_stamp; + if (unlikely(delta < 0)) + delta = 0; + + total = sched_avg_period() + delta; if (unlikely(total < avg)) { - /* Ensures that power won't end up being negative */ + /* Ensures that capacity won't end up being negative */ available = 0; } else { available = total - avg; } - if (unlikely((s64)total < SCHED_POWER_SCALE)) - total = SCHED_POWER_SCALE; + if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) + total = SCHED_CAPACITY_SCALE; - total >>= SCHED_POWER_SHIFT; + total >>= SCHED_CAPACITY_SHIFT; return div_u64(available, total); } -static void update_cpu_power(struct sched_domain *sd, int cpu) +static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long power = SCHED_POWER_SCALE; + unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - if (sched_feat(ARCH_POWER)) - power *= arch_scale_smt_power(sd, cpu); + if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { + if (sched_feat(ARCH_CAPACITY)) + capacity *= arch_scale_smt_capacity(sd, cpu); else - power *= default_scale_smt_power(sd, cpu); + capacity *= default_scale_smt_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; } - sdg->sgp->power_orig = power; + sdg->sgc->capacity_orig = capacity; - if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); + if (sched_feat(ARCH_CAPACITY)) + capacity *= arch_scale_freq_capacity(sd, cpu); else - power *= default_scale_freq_power(sd, cpu); + capacity *= default_scale_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; - power *= scale_rt_power(cpu); - power >>= SCHED_POWER_SHIFT; + capacity *= scale_rt_capacity(cpu); + capacity >>= SCHED_CAPACITY_SHIFT; - if (!power) - power = 1; + if (!capacity) + capacity = 1; - cpu_rq(cpu)->cpu_power = power; - sdg->sgp->power = power; + cpu_rq(cpu)->cpu_capacity = capacity; + sdg->sgc->capacity = capacity; } -void update_group_power(struct sched_domain *sd, int cpu) +void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power; + unsigned long capacity, capacity_orig; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); interval = clamp(interval, 1UL, max_load_balance_interval); - sdg->sgp->next_update = jiffies + interval; + sdg->sgc->next_update = jiffies + interval; if (!child) { - update_cpu_power(sd, cpu); + update_cpu_capacity(sd, cpu); return; } - power = 0; + capacity_orig = capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -4467,8 +5725,33 @@ void update_group_power(struct sched_domain *sd, int cpu) * span the current group. */ - for_each_cpu(cpu, sched_group_cpus(sdg)) - power += power_of(cpu); + for_each_cpu(cpu, sched_group_cpus(sdg)) { + struct sched_group_capacity *sgc; + struct rq *rq = cpu_rq(cpu); + + /* + * build_sched_domains() -> init_sched_groups_capacity() + * gets here before we've attached the domains to the + * runqueues. + * + * Use capacity_of(), which is set irrespective of domains + * in update_cpu_capacity(). + * + * This avoids capacity/capacity_orig from being 0 and + * causing divide-by-zero issues on boot. + * + * Runtime updates will correct capacity_orig. + */ + if (unlikely(!rq->sd)) { + capacity_orig += capacity_of(cpu); + capacity += capacity_of(cpu); + continue; + } + + sgc = rq->sd->groups->sgc; + capacity_orig += sgc->capacity_orig; + capacity += sgc->capacity; + } } else { /* * !SD_OVERLAP domains can assume that child groups @@ -4477,12 +5760,14 @@ void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power += group->sgp->power; + capacity_orig += group->sgc->capacity_orig; + capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgp->power_orig = sdg->sgp->power = power; + sdg->sgc->capacity_orig = capacity_orig; + sdg->sgc->capacity = capacity; } /* @@ -4496,15 +5781,15 @@ static inline int fix_small_capacity(struct sched_domain *sd, struct sched_group *group) { /* - * Only siblings can have significantly less than SCHED_POWER_SCALE + * Only siblings can have significantly less than SCHED_CAPACITY_SCALE */ - if (!(sd->flags & SD_SHARE_CPUPOWER)) + if (!(sd->flags & SD_SHARE_CPUCAPACITY)) return 0; /* - * If ~90% of the cpu_power is still there, we're good. + * If ~90% of the cpu_capacity is still there, we're good. */ - if (group->sgp->power * 32 > group->sgp->power_orig * 29) + if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) return 1; return 0; @@ -4526,13 +5811,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * cpu 3 and leave one of the cpus in the second group unused. * * The current solution to this issue is detecting the skew in the first group - * by noticing it has a cpu that is overloaded while the remaining cpus are - * idle -- or rather, there's a distinct imbalance in the cpus; see - * sg_imbalanced(). + * by noticing the lower domain failed to reach balance and had difficulty + * moving tasks due to affinity constraints. * * When this is so detected; this group becomes a candidate for busiest; see - * update_sd_pick_busiest(). And calculcate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditional to allow it + * update_sd_pick_busiest(). And calculate_imbalance() and + * find_busiest_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the @@ -4540,49 +5824,37 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * subtle and fragile situation. */ -struct sg_imb_stats { - unsigned long max_nr_running, min_nr_running; - unsigned long max_cpu_load, min_cpu_load; -}; - -static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) +static inline int sg_imbalanced(struct sched_group *group) { - sgi->max_cpu_load = sgi->max_nr_running = 0UL; - sgi->min_cpu_load = sgi->min_nr_running = ~0UL; + return group->sgc->imbalance; } -static inline void -update_sg_imb_stats(struct sg_imb_stats *sgi, - unsigned long load, unsigned long nr_running) +/* + * Compute the group capacity factor. + * + * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by + * first dividing out the smt factor and computing the actual number of cores + * and limit unit capacity with that. + */ +static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) { - if (load > sgi->max_cpu_load) - sgi->max_cpu_load = load; - if (sgi->min_cpu_load > load) - sgi->min_cpu_load = load; + unsigned int capacity_factor, smt, cpus; + unsigned int capacity, capacity_orig; - if (nr_running > sgi->max_nr_running) - sgi->max_nr_running = nr_running; - if (sgi->min_nr_running > nr_running) - sgi->min_nr_running = nr_running; -} + capacity = group->sgc->capacity; + capacity_orig = group->sgc->capacity_orig; + cpus = group->group_weight; -static inline int -sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) -{ - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of a task. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? - */ - if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && - (sgi->max_nr_running - sgi->min_nr_running) > 1) - return 1; + /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); + capacity_factor = cpus / smt; /* cores */ - return 0; + capacity_factor = min_t(unsigned, + capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); + + return capacity_factor; } /** @@ -4597,56 +5869,45 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs) { - struct sg_imb_stats sgi; - unsigned long nr_running; unsigned long load; int i; - init_sg_imb_stats(&sgi); + memset(sgs, 0, sizeof(*sgs)); for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); - nr_running = rq->nr_running; - /* Bias balancing toward cpus of our domain */ - if (local_group) { + if (local_group) load = target_load(i, load_idx); - } else { + else load = source_load(i, load_idx); - update_sg_imb_stats(&sgi, load, nr_running); - } sgs->group_load += load; - sgs->sum_nr_running += nr_running; + sgs->sum_nr_running += rq->nr_running; +#ifdef CONFIG_NUMA_BALANCING + sgs->nr_numa_running += rq->nr_numa_running; + sgs->nr_preferred_running += rq->nr_preferred_running; +#endif sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; } - if (local_group && (env->idle != CPU_NEWLY_IDLE || - time_after_eq(jiffies, group->sgp->next_update))) - update_group_power(env->sd, env->dst_cpu); - - /* Adjust by relative CPU power of the group */ - sgs->group_power = group->sgp->power; - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - sgs->group_imb = sg_imbalanced(sgs, &sgi); - - sgs->group_capacity = - DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); - - if (!sgs->group_capacity) - sgs->group_capacity = fix_small_capacity(env->sd, group); - sgs->group_weight = group->group_weight; - if (sgs->group_capacity > sgs->sum_nr_running) - sgs->group_has_capacity = 1; + sgs->group_imb = sg_imbalanced(group); + sgs->group_capacity_factor = sg_capacity_factor(env, group); + + if (sgs->group_capacity_factor > sgs->sum_nr_running) + sgs->group_has_free_capacity = 1; } /** @@ -4670,7 +5931,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= sds->busiest_stat.avg_load) return false; - if (sgs->sum_nr_running > sgs->group_capacity) + if (sgs->sum_nr_running > sgs->group_capacity_factor) return true; if (sgs->group_imb) @@ -4693,14 +5954,42 @@ static bool update_sd_pick_busiest(struct lb_env *env, return false; } +#ifdef CONFIG_NUMA_BALANCING +static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running > sgs->nr_numa_running) + return regular; + if (sgs->sum_nr_running > sgs->nr_preferred_running) + return remote; + return all; +} + +static inline enum fbq_type fbq_classify_rq(struct rq *rq) +{ + if (rq->nr_running > rq->nr_numa_running) + return regular; + if (rq->nr_running > rq->nr_preferred_running) + return remote; + return all; +} +#else +static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) +{ + return all; +} + +static inline enum fbq_type fbq_classify_rq(struct rq *rq) +{ + return regular; +} +#endif /* CONFIG_NUMA_BALANCING */ + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. - * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ -static inline void update_sd_lb_stats(struct lb_env *env, - struct sd_lb_stats *sds) +static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; @@ -4720,36 +6009,46 @@ static inline void update_sd_lb_stats(struct lb_env *env, if (local_group) { sds->local = sg; sgs = &sds->local_stat; + + if (env->idle != CPU_NEWLY_IDLE || + time_after_eq(jiffies, sg->sgc->next_update)) + update_group_capacity(env->sd, env->dst_cpu); } - memset(sgs, 0, sizeof(*sgs)); update_sg_lb_stats(env, sg, load_idx, local_group, sgs); + if (local_group) + goto next_group; + /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity to one so that we'll try + * first, lower the sg capacity factor to one so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity. The + * these excess tasks, i.e. nr_running < group_capacity_factor. The * extra check prevents the case where you always pull from the * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ - if (prefer_sibling && !local_group && - sds->local && sds->local_stat.group_has_capacity) - sgs->group_capacity = min(sgs->group_capacity, 1U); - - /* Now, start updating sd_lb_stats */ - sds->total_load += sgs->group_load; - sds->total_pwr += sgs->group_power; + if (prefer_sibling && sds->local && + sds->local_stat.group_has_free_capacity) + sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); - if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; } +next_group: + /* Now, start updating sd_lb_stats */ + sds->total_load += sgs->group_load; + sds->total_capacity += sgs->group_capacity; + sg = sg->next; } while (sg != env->sd->groups); + + if (env->sd->flags & SD_NUMA) + env->fbq_type = fbq_classify_group(&sds->busiest_stat); } /** @@ -4790,8 +6089,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) return 0; env->imbalance = DIV_ROUND_CLOSEST( - sds->busiest_stat.avg_load * sds->busiest_stat.group_power, - SCHED_POWER_SCALE); + sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, + SCHED_CAPACITY_SCALE); return 1; } @@ -4806,7 +6105,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned long tmp, capa_now = 0, capa_move = 0; unsigned int imbn = 2; unsigned long scaled_busy_load_per_task; struct sg_lb_stats *local, *busiest; @@ -4820,51 +6119,49 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) imbn = 1; scaled_busy_load_per_task = - (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; + (busiest->load_per_task * SCHED_CAPACITY_SCALE) / + busiest->group_capacity; - if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= - (scaled_busy_load_per_task * imbn)) { + if (busiest->avg_load + scaled_busy_load_per_task >= + local->avg_load + (scaled_busy_load_per_task * imbn)) { env->imbalance = busiest->load_per_task; return; } /* * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by + * however we may be able to increase total CPU capacity used by * moving them. */ - pwr_now += busiest->group_power * + capa_now += busiest->group_capacity * min(busiest->load_per_task, busiest->avg_load); - pwr_now += local->group_power * + capa_now += local->group_capacity * min(local->load_per_task, local->avg_load); - pwr_now /= SCHED_POWER_SCALE; + capa_now /= SCHED_CAPACITY_SCALE; /* Amount of load we'd subtract */ - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; - if (busiest->avg_load > tmp) { - pwr_move += busiest->group_power * + if (busiest->avg_load > scaled_busy_load_per_task) { + capa_move += busiest->group_capacity * min(busiest->load_per_task, - busiest->avg_load - tmp); + busiest->avg_load - scaled_busy_load_per_task); } /* Amount of load we'd add */ - if (busiest->avg_load * busiest->group_power < - busiest->load_per_task * SCHED_POWER_SCALE) { - tmp = (busiest->avg_load * busiest->group_power) / - local->group_power; + if (busiest->avg_load * busiest->group_capacity < + busiest->load_per_task * SCHED_CAPACITY_SCALE) { + tmp = (busiest->avg_load * busiest->group_capacity) / + local->group_capacity; } else { - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - local->group_power; + tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / + local->group_capacity; } - pwr_move += local->group_power * + capa_move += local->group_capacity * min(local->load_per_task, local->avg_load + tmp); - pwr_move /= SCHED_POWER_SCALE; + capa_move /= SCHED_CAPACITY_SCALE; /* Move if we gain throughput */ - if (pwr_move > pwr_now) + if (capa_move > capa_now) env->imbalance = busiest->load_per_task; } @@ -4894,9 +6191,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * In the presence of smp nice balancing, certain scenarios can have * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) + * its cpu_capacity, while calculating max_load..) */ - if (busiest->avg_load < sds->avg_load) { + if (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load) { env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -4908,10 +6206,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * have to drop below capacity to reach cpu-load equilibrium. */ load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity); + (busiest->sum_nr_running - busiest->group_capacity_factor); - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= busiest->group_power; + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); + load_above_capacity /= busiest->group_capacity; } /* @@ -4926,9 +6224,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* How much load to actually move to equalise the imbalance */ env->imbalance = min( - max_pull * busiest->group_power, - (sds->avg_load - local->avg_load) * local->group_power - ) / SCHED_POWER_SCALE; + max_pull * busiest->group_capacity, + (sds->avg_load - local->avg_load) * local->group_capacity + ) / SCHED_CAPACITY_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -4982,7 +6280,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; + sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) + / sds.total_capacity; /* * If the busiest group is imbalanced the below checks don't @@ -4993,8 +6292,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && - !busiest->group_has_capacity) + if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && + !busiest->group_has_free_capacity) goto force_balance; /* @@ -5048,42 +6347,66 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long busiest_load = 0, busiest_power = 1; + unsigned long busiest_load = 0, busiest_capacity = 1; int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long power = power_of(i); - unsigned long capacity = DIV_ROUND_CLOSEST(power, - SCHED_POWER_SCALE); - unsigned long wl; - - if (!capacity) - capacity = fix_small_capacity(env->sd, group); + unsigned long capacity, capacity_factor, wl; + enum fbq_type rt; rq = cpu_rq(i); + rt = fbq_classify_rq(rq); + + /* + * We classify groups/runqueues into three groups: + * - regular: there are !numa tasks + * - remote: there are numa tasks that run on the 'wrong' node + * - all: there is no distinction + * + * In order to avoid migrating ideally placed numa tasks, + * ignore those when there's better options. + * + * If we ignore the actual busiest queue to migrate another + * task, the next balance pass can still reduce the busiest + * queue by moving tasks around inside the node. + * + * If we cannot move enough load due to this classification + * the next pass will adjust the group classification and + * allow migration of more tasks. + * + * Both cases only affect the total convergence complexity. + */ + if (rt > env->fbq_type) + continue; + + capacity = capacity_of(i); + capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); + wl = weighted_cpuload(i); /* * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu power. + * which is not scaled with the cpu capacity. */ - if (capacity && rq->nr_running == 1 && wl > env->imbalance) + if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) continue; /* * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu power, so that - * the load can be moved away from the cpu that is potentially - * running at a lower capacity. + * the weighted_cpuload() scaled with the cpu capacity, so + * that the load can be moved away from the cpu that is + * potentially running at a lower capacity. * - * Thus we're looking for max(wl_i / power_i), crosswise + * Thus we're looking for max(wl_i / capacity_i), crosswise * multiplication to rid ourselves of the division works out - * to: wl_i * power_j > wl_j * power_i; where j is our - * previous maximum. + * to: wl_i * capacity_j > wl_j * capacity_i; where j is + * our previous maximum. */ - if (wl * busiest_power > busiest_load * power) { + if (wl * busiest_capacity > busiest_load * capacity) { busiest_load = wl; - busiest_power = power; + busiest_capacity = capacity; busiest = rq; } } @@ -5163,6 +6486,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; + struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -5176,6 +6500,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, + .fbq_type = all, }; /* @@ -5267,17 +6592,17 @@ more_balance: * moreover subsequent load balance cycles should correct the * excess load moved. */ - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { + + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; - env.flags &= ~LBF_SOME_PINNED; + env.flags &= ~LBF_DST_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; - /* Prevent to re-select dst_cpu via env's cpus */ - cpumask_clear_cpu(env.dst_cpu, env.cpus); - /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. @@ -5285,6 +6610,18 @@ more_balance: goto more_balance; } + /* + * We failed to reach balance because of affinity. + */ + if (sd_parent) { + int *group_imbalance = &sd_parent->groups->sgc->imbalance; + + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + *group_imbalance = 1; + } else if (*group_imbalance) + *group_imbalance = 0; + } + /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); @@ -5383,20 +6720,62 @@ out: return ld_moved; } +static inline unsigned long +get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) +{ + unsigned long interval = sd->balance_interval; + + if (cpu_busy) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + + return interval; +} + +static inline void +update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +{ + unsigned long interval, next; + + interval = get_sd_balance_interval(sd, cpu_busy); + next = sd->last_balance + interval; + + if (time_after(*next_balance, next)) + *next_balance = next; +} + /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -void idle_balance(int this_cpu, struct rq *this_rq) +static int idle_balance(struct rq *this_rq) { + unsigned long next_balance = jiffies + HZ; + int this_cpu = this_rq->cpu; struct sched_domain *sd; int pulled_task = 0; - unsigned long next_balance = jiffies + HZ; + u64 curr_cost = 0; + idle_enter_fair(this_rq); + + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) - return; + if (this_rq->avg_idle < sysctl_sched_migration_cost) { + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (sd) + update_next_balance(sd, 0, &next_balance); + rcu_read_unlock(); + + goto out; + } /* * Drop the rq->lock, but keep IRQ/preempt disabled. @@ -5406,38 +6785,70 @@ void idle_balance(int this_cpu, struct rq *this_rq) update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { - unsigned long interval; int continue_balancing = 1; + u64 t0, domain_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue; + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { + update_next_balance(sd, 0, &next_balance); + break; + } + if (sd->flags & SD_BALANCE_NEWIDLE) { - /* If we've pulled tasks over stop searching: */ + t0 = sched_clock_cpu(this_cpu); + pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); + + domain_cost = sched_clock_cpu(this_cpu) - t0; + if (domain_cost > sd->max_newidle_lb_cost) + sd->max_newidle_lb_cost = domain_cost; + + curr_cost += domain_cost; } - interval = msecs_to_jiffies(sd->balance_interval); - if (time_after(next_balance, sd->last_balance + interval)) - next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; + update_next_balance(sd, 0, &next_balance); + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) break; - } } rcu_read_unlock(); raw_spin_lock(&this_rq->lock); - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { - /* - * We are going idle. next_balance may be set based on - * a busy processor. So reset next_balance. - */ + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; + + /* + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. + */ + if (this_rq->cfs.h_nr_running && !pulled_task) + pulled_task = 1; + +out: + /* Move the next balance forward */ + if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; + + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + pulled_task = -1; + + if (pulled_task) { + idle_exit_fair(this_rq); + this_rq->idle_stamp = 0; } + + return pulled_task; } /* @@ -5508,6 +6919,11 @@ out_unlock: return 0; } +static inline int on_null_domain(struct rq *rq) +{ + return unlikely(!rcu_dereference_sched(rq->sd)); +} + #ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details @@ -5521,7 +6937,7 @@ static struct { unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; -static inline int find_new_ilb(int call_cpu) +static inline int find_new_ilb(void) { int ilb = cpumask_first(nohz.idle_cpus_mask); @@ -5536,13 +6952,13 @@ static inline int find_new_ilb(int call_cpu) * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). */ -static void nohz_balancer_kick(int cpu) +static void nohz_balancer_kick(void) { int ilb_cpu; nohz.next_balance++; - ilb_cpu = find_new_ilb(cpu); + ilb_cpu = find_new_ilb(); if (ilb_cpu >= nr_cpu_ids) return; @@ -5562,8 +6978,13 @@ static void nohz_balancer_kick(int cpu) static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); + /* + * Completely isolated CPUs don't ever set, so we must test. + */ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -5571,16 +6992,16 @@ static inline void nohz_balance_exit_idle(int cpu) static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; + int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq()->sd); + sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (!sd || !sd->nohz_idle) goto unlock; sd->nohz_idle = 0; - for (; sd; sd = sd->parent) - atomic_inc(&sd->groups->sgp->nr_busy_cpus); + atomic_inc(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -5588,16 +7009,16 @@ unlock: void set_cpu_sd_state_idle(void) { struct sched_domain *sd; + int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq()->sd); + sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (!sd || sd->nohz_idle) goto unlock; sd->nohz_idle = 1; - for (; sd; sd = sd->parent) - atomic_dec(&sd->groups->sgp->nr_busy_cpus); + atomic_dec(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -5617,6 +7038,12 @@ void nohz_balance_enter_idle(int cpu) if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; + /* + * If we're a completely isolated CPU, we don't play. + */ + if (on_null_domain(cpu_rq(cpu))) + return; + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); @@ -5652,34 +7079,51 @@ void update_max_interval(void) * * Balancing parameters are set up in init_sched_domains. */ -static void rebalance_domains(int cpu, enum cpu_idle_type idle) +static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; - struct rq *rq = cpu_rq(cpu); + int cpu = rq->cpu; unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - int need_serialize; + int need_serialize, need_decay = 0; + u64 max_cost = 0; update_blocked_averages(cpu); rcu_read_lock(); for_each_domain(cpu, sd) { + /* + * Decay the newidle max times here because this is a regular + * visit to all the domains. Decay ~1% per second. + */ + if (time_after(jiffies, sd->next_decay_max_lb_cost)) { + sd->max_newidle_lb_cost = + (sd->max_newidle_lb_cost * 253) / 256; + sd->next_decay_max_lb_cost = jiffies + HZ; + need_decay = 1; + } + max_cost += sd->max_newidle_lb_cost; + if (!(sd->flags & SD_LOAD_BALANCE)) continue; - interval = sd->balance_interval; - if (idle != CPU_IDLE) - interval *= sd->busy_factor; + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!continue_balancing) { + if (need_decay) + continue; + break; + } - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - interval = clamp(interval, 1UL, max_load_balance_interval); + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { if (!spin_trylock(&balancing)) goto out; @@ -5688,13 +7132,14 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { /* - * The LBF_SOME_PINNED logic could have changed + * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); } if (need_serialize) spin_unlock(&balancing); @@ -5703,14 +7148,14 @@ out: next_balance = sd->last_balance + interval; update_next_balance = 1; } - + } + if (need_decay) { /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. + * Ensure the rq-wide value also decays but keep it at a + * reasonable floor to avoid funnies with rq->avg_idle. */ - if (!continue_balancing) - break; + rq->max_idle_balance_cost = + max((u64)sysctl_sched_migration_cost, max_cost); } rcu_read_unlock(); @@ -5728,9 +7173,9 @@ out: * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { - struct rq *this_rq = cpu_rq(this_cpu); + int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; @@ -5752,12 +7197,17 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) rq = cpu_rq(balance_cpu); - raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); - update_idle_cpu_load(rq); - raw_spin_unlock_irq(&rq->lock); - - rebalance_domains(balance_cpu, CPU_IDLE); + /* + * If time for next balance is due, + * do the balance. + */ + if (time_after_eq(jiffies, rq->next_balance)) { + raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + raw_spin_unlock_irq(&rq->lock); + rebalance_domains(rq, CPU_IDLE); + } if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; @@ -5772,16 +7222,18 @@ end: * of an idle cpu is the system. * - This rq has more than one task. * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's power. + * busy cpu's exceeding the group's capacity. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline int nohz_kick_needed(struct rq *rq, int cpu) +static inline int nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; + struct sched_group_capacity *sgc; + int nr_busy, cpu = rq->cpu; - if (unlikely(idle_cpu(cpu))) + if (unlikely(rq->idle_balance)) return 0; /* @@ -5805,22 +7257,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) goto need_kick; rcu_read_lock(); - for_each_domain(cpu, sd) { - struct sched_group *sg = sd->groups; - struct sched_group_power *sgp = sg->sgp; - int nr_busy = atomic_read(&sgp->nr_busy_cpus); + sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) - goto need_kick_unlock; + if (sd) { + sgc = sd->groups->sgc; + nr_busy = atomic_read(&sgc->nr_busy_cpus); - if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight - && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) + if (nr_busy > 1) goto need_kick_unlock; - - if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) - break; } + + sd = rcu_dereference(per_cpu(sd_asym, cpu)); + + if (sd && (cpumask_first_and(nohz.idle_cpus_mask, + sched_domain_span(sd)) < cpu)) + goto need_kick_unlock; + rcu_read_unlock(); return 0; @@ -5830,7 +7282,7 @@ need_kick: return 1; } #else -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } #endif /* @@ -5839,38 +7291,34 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } */ static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); - struct rq *this_rq = cpu_rq(this_cpu); + struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(this_cpu, idle); + rebalance_domains(this_rq, idle); /* * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - nohz_idle_balance(this_cpu, idle); -} - -static inline int on_null_domain(int cpu) -{ - return !rcu_dereference_sched(cpu_rq(cpu)->sd); + nohz_idle_balance(this_rq, idle); } /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -void trigger_load_balance(struct rq *rq, int cpu) +void trigger_load_balance(struct rq *rq) { /* Don't need to rebalance while attached to NULL domain */ - if (time_after_eq(jiffies, rq->next_balance) && - likely(!on_null_domain(cpu))) + if (unlikely(on_null_domain(rq))) + return; + + if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) - nohz_balancer_kick(cpu); + if (nohz_kick_needed(rq)) + nohz_balancer_kick(); #endif } @@ -5928,11 +7376,15 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - if (unlikely(task_cpu(p) != this_cpu)) { - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - } + /* + * Not only the cpu but also the task_group of the parent might have + * been changed after parent->se.parent,cfs_rq were copied to + * child->se.parent,cfs_rq. So call __set_task_cpu() to make those + * of child point to valid ones. + */ + rcu_read_lock(); + __set_task_cpu(p, this_cpu); + rcu_read_unlock(); update_curr(cfs_rq); @@ -5982,15 +7434,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when its + * Ensure the task's vruntime is normalized, so that when it's * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!se->on_rq && p->state != TASK_RUNNING) { + if (!p->on_rq && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -6017,7 +7469,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { - if (!p->se.on_rq) + struct sched_entity *se = &p->se; +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Since the real-depth could have been changed (only FAIR + * class maintain depth value), reset depth properly. + */ + se->depth = se->parent ? se->parent->depth + 1 : 0; +#endif + if (!se->on_rq) return; /* @@ -6065,7 +7525,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq; + /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -6091,23 +7553,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. */ - if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) + if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) on_rq = 1; if (!on_rq) - p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; if (!on_rq) { - cfs_rq = cfs_rq_of(&p->se); - p->se.vruntime += cfs_rq->min_vruntime; + cfs_rq = cfs_rq_of(se); + se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP /* * migrate_task_rq_fair() will have removed our previous * contribution, but we must synchronize for ongoing future * decay. */ - p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; #endif } } @@ -6203,13 +7666,17 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, if (!se) return; - if (!parent) + if (!parent) { se->cfs_rq = &rq->cfs; - else + se->depth = 0; + } else { se->cfs_rq = parent->my_q; + se->depth = parent->depth + 1; + } se->my_q = cfs_rq; - update_load_set(&se->load, 0); + /* guarantee group entities always have weight */ + update_load_set(&se->load, NICE_0_LOAD); se->parent = parent; } |
