diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/Makefile | 2 | ||||
-rw-r--r-- | kernel/sched/core.c | 463 | ||||
-rw-r--r-- | kernel/sched/debug.c | 12 | ||||
-rw-r--r-- | kernel/sched/fair.c | 472 | ||||
-rw-r--r-- | kernel/sched/features.h | 1 | ||||
-rw-r--r-- | kernel/sched/idle_task.c | 2 | ||||
-rw-r--r-- | kernel/sched/rt.c | 56 | ||||
-rw-r--r-- | kernel/sched/sched.h | 8 |
8 files changed, 384 insertions, 632 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a7dd35102a..173ea52f3af 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o - - diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b189fecaef9..39eb6011bc3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -83,6 +83,7 @@ #include "sched.h" #include "../workqueue_sched.h" +#include "../smpboot.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -692,8 +693,6 @@ int tg_nop(struct task_group *tg, void *data) } #endif -void update_cpu_load(struct rq *this_rq); - static void set_load_weight(struct task_struct *p) { int prio = p->static_prio - MAX_RT_PRIO; @@ -2083,6 +2082,7 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ + rcu_switch_from(prev); switch_to(prev, next, prev); barrier(); @@ -2486,22 +2486,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * scheduler tick (TICK_NSEC). With tickless idle this will not be called * every tick. We fix it up based on jiffies. */ -void update_cpu_load(struct rq *this_rq) +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) { - unsigned long this_load = this_rq->load.weight; - unsigned long curr_jiffies = jiffies; - unsigned long pending_updates; int i, scale; this_rq->nr_load_updates++; - /* Avoid repeated calls on same jiffy, when moving in and out of idle */ - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - /* Update our load: */ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { @@ -2526,9 +2517,45 @@ void update_cpu_load(struct rq *this_rq) sched_avg_update(this_rq); } +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = jiffies; + unsigned long load = this_rq->load.weight; + unsigned long pending_updates; + + /* + * Bloody broken means of dealing with nohz, but better than nothing.. + * jiffies is updated by one cpu, another cpu can drift wrt the jiffy + * update and see 0 difference the one time and 2 the next, even though + * we ticked at roughtly the same rate. + * + * Hence we only use this from nohz_idle_balance() and skip this + * nonsense when called from the scheduler_tick() since that's + * guaranteed a stable rate. + */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from scheduler_tick() + */ static void update_cpu_load_active(struct rq *this_rq) { - update_cpu_load(this_rq); + /* + * See the mess in update_idle_cpu_load(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, this_rq->load.weight, 1); calc_load_account_active(this_rq); } @@ -3113,6 +3140,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (irqs_disabled()) print_irqtrace_events(prev); dump_stack(); + add_taint(TAINT_WARN); } /* @@ -5557,7 +5585,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (cpumask_intersects(groupmask, sched_group_cpus(group))) { + if (!(sd->flags & SD_OVERLAP) && + cpumask_intersects(groupmask, sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; @@ -5895,99 +5924,11 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -#ifdef CONFIG_NUMA - -/** - * find_next_best_node - find the next node to include in a sched_domain - * @node: node whose sched_domain we're building - * @used_nodes: nodes already in the sched_domain - * - * Find the next node to include in a given scheduling domain. Simply - * finds the closest node not already in the @used_nodes map. - * - * Should use nodemask_t. - */ -static int find_next_best_node(int node, nodemask_t *used_nodes) -{ - int i, n, val, min_val, best_node = -1; - - min_val = INT_MAX; - - for (i = 0; i < nr_node_ids; i++) { - /* Start at @node */ - n = (node + i) % nr_node_ids; - - if (!nr_cpus_node(n)) - continue; - - /* Skip already used nodes */ - if (node_isset(n, *used_nodes)) - continue; - - /* Simple min distance search */ - val = node_distance(node, n); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - if (best_node != -1) - node_set(best_node, *used_nodes); - return best_node; -} - -/** - * sched_domain_node_span - get a cpumask for a node's sched_domain - * @node: node whose cpumask we're constructing - * @span: resulting cpumask - * - * Given a node, construct a good cpumask for its sched_domain to span. It - * should be one that prevents unnecessary balancing, but also spreads tasks - * out optimally. - */ -static void sched_domain_node_span(int node, struct cpumask *span) -{ - nodemask_t used_nodes; - int i; - - cpumask_clear(span); - nodes_clear(used_nodes); - - cpumask_or(span, span, cpumask_of_node(node)); - node_set(node, used_nodes); - - for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, &used_nodes); - if (next_node < 0) - break; - cpumask_or(span, span, cpumask_of_node(next_node)); - } -} - -static const struct cpumask *cpu_node_mask(int cpu) -{ - lockdep_assert_held(&sched_domains_mutex); - - sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); - - return sched_domains_tmpmask; -} - -static const struct cpumask *cpu_allnodes_mask(int cpu) -{ - return cpu_possible_mask; -} -#endif /* CONFIG_NUMA */ - static const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } -int sched_smt_power_savings = 0, sched_mc_power_savings = 0; - struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; @@ -6017,6 +5958,7 @@ struct sched_domain_topology_level { sched_domain_init_f init; sched_domain_mask_f mask; int flags; + int numa_level; struct sd_data data; }; @@ -6208,10 +6150,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ } SD_INIT_FUNC(CPU) -#ifdef CONFIG_NUMA - SD_INIT_FUNC(ALLNODES) - SD_INIT_FUNC(NODE) -#endif #ifdef CONFIG_SCHED_SMT SD_INIT_FUNC(SIBLING) #endif @@ -6333,15 +6271,184 @@ static struct sched_domain_topology_level default_topology[] = { { sd_init_BOOK, cpu_book_mask, }, #endif { sd_init_CPU, cpu_cpu_mask, }, -#ifdef CONFIG_NUMA - { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, - { sd_init_ALLNODES, cpu_allnodes_mask, }, -#endif { NULL, }, }; static struct sched_domain_topology_level *sched_domain_topology = default_topology; +#ifdef CONFIG_NUMA + +static int sched_domains_numa_levels; +static int sched_domains_numa_scale; +static int *sched_domains_numa_distance; +static struct cpumask ***sched_domains_numa_masks; +static int sched_domains_curr_level; + +static inline int sd_local_flags(int level) +{ + if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) + return 0; + + return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; +} + +static struct sched_domain * +sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +{ + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); + int level = tl->numa_level; + int sd_weight = cpumask_weight( + sched_domains_numa_masks[level][cpu_to_node(cpu)]); + + *sd = (struct sched_domain){ + .min_interval = sd_weight, + .max_interval = 2*sd_weight, + .busy_factor = 32, + .imbalance_pct = 125, + .cache_nice_tries = 2, + .busy_idx = 3, + .idle_idx = 2, + .newidle_idx = 0, + .wake_idx = 0, + .forkexec_idx = 0, + + .flags = 1*SD_LOAD_BALANCE + | 1*SD_BALANCE_NEWIDLE + | 0*SD_BALANCE_EXEC + | 0*SD_BALANCE_FORK + | 0*SD_BALANCE_WAKE + | 0*SD_WAKE_AFFINE + | 0*SD_PREFER_LOCAL + | 0*SD_SHARE_CPUPOWER + | 0*SD_SHARE_PKG_RESOURCES + | 1*SD_SERIALIZE + | 0*SD_PREFER_SIBLING + | sd_local_flags(level) + , + .last_balance = jiffies, + .balance_interval = sd_weight, + }; + SD_INIT_NAME(sd, NUMA); + sd->private = &tl->data; + + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; + + return sd; +} + +static const struct cpumask *sd_numa_mask(int cpu) +{ + return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; +} + +static void sched_init_numa(void) +{ + int next_distance, curr_distance = node_distance(0, 0); + struct sched_domain_topology_level *tl; + int level = 0; + int i, j, k; + + sched_domains_numa_scale = curr_distance; + sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); + if (!sched_domains_numa_distance) + return; + + /* + * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * unique distances in the node_distance() table. + * + * Assumes node_distance(0,j) includes all distances in + * node_distance(i,j) in order to avoid cubic time. + * + * XXX: could be optimized to O(n log n) by using sort() + */ + next_distance = curr_distance; + for (i = 0; i < nr_node_ids; i++) { + for (j = 0; j < nr_node_ids; j++) { + int distance = node_distance(0, j); + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; + } + /* + * 'level' contains the number of unique distances, excluding the + * identity distance node_distance(i,i). + * + * The sched_domains_nume_distance[] array includes the actual distance + * numbers. + */ + + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + if (!sched_domains_numa_masks) + return; + + /* + * Now for each level, construct a mask per node which contains all + * cpus of nodes that are that many hops away from us. + */ + for (i = 0; i < level; i++) { + sched_domains_numa_masks[i] = + kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); + if (!sched_domains_numa_masks[i]) + return; + + for (j = 0; j < nr_node_ids; j++) { + struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); + if (!mask) + return; + + sched_domains_numa_masks[i][j] = mask; + + for (k = 0; k < nr_node_ids; k++) { + if (node_distance(j, k) > sched_domains_numa_distance[i]) + continue; + + cpumask_or(mask, mask, cpumask_of_node(k)); + } + } + } + + tl = kzalloc((ARRAY_SIZE(default_topology) + level) * + sizeof(struct sched_domain_topology_level), GFP_KERNEL); + if (!tl) + return; + + /* + * Copy the default topology bits.. + */ + for (i = 0; default_topology[i].init; i++) + tl[i] = default_topology[i]; + + /* + * .. and append 'j' levels of NUMA goodness. + */ + for (j = 0; j < level; i++, j++) { + tl[i] = (struct sched_domain_topology_level){ + .init = sd_numa_init, + .mask = sd_numa_mask, + .flags = SDTL_OVERLAP, + .numa_level = j, + }; + } + + sched_domain_topology = tl; +} +#else +static inline void sched_init_numa(void) +{ +} +#endif /* CONFIG_NUMA */ + static int __sdt_alloc(const struct cpumask *cpu_map) { struct sched_domain_topology_level *tl; @@ -6379,6 +6486,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sg) return -ENOMEM; + sg->next = sg; + *per_cpu_ptr(sdd->sg, j) = sg; sgp = kzalloc_node(sizeof(struct sched_group_power), @@ -6402,16 +6511,26 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) - free_sched_groups(sd->groups, 0); - kfree(*per_cpu_ptr(sdd->sd, j)); - kfree(*per_cpu_ptr(sdd->sg, j)); - kfree(*per_cpu_ptr(sdd->sgp, j)); + struct sched_domain *sd; + + if (sdd->sd) { + sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); + kfree(*per_cpu_ptr(sdd->sd, j)); + } + + if (sdd->sg) + kfree(*per_cpu_ptr(sdd->sg, j)); + if (sdd->sgp) + kfree(*per_cpu_ptr(sdd->sgp, j)); } free_percpu(sdd->sd); + sdd->sd = NULL; free_percpu(sdd->sg); + sdd->sg = NULL; free_percpu(sdd->sgp); + sdd->sgp = NULL; } } @@ -6697,97 +6816,6 @@ match2: mutex_unlock(&sched_domains_mutex); } -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static void reinit_sched_domains(void) -{ - get_online_cpus(); - - /* Destroy domains first to force the rebuild */ - partition_sched_domains(0, NULL, NULL); - - rebuild_sched_domains(); - put_online_cpus(); -} - -static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) -{ - unsigned int level = 0; - - if (sscanf(buf, "%u", &level) != 1) - return -EINVAL; - - /* - * level is always be positive so don't check for - * level < POWERSAVINGS_BALANCE_NONE which is 0 - * What happens on 0 or 1 byte write, - * need to check for count as well? - */ - - if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) - return -EINVAL; - - if (smt) - sched_smt_power_savings = level; - else - sched_mc_power_savings = level; - - reinit_sched_domains(); - - return count; -} - -#ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", sched_mc_power_savings); -} -static ssize_t sched_mc_power_savings_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 0); -} -static DEVICE_ATTR(sched_mc_power_savings, 0644, - sched_mc_power_savings_show, - sched_mc_power_savings_store); -#endif - -#ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", sched_smt_power_savings); -} -static ssize_t sched_smt_power_savings_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 1); -} -static DEVICE_ATTR(sched_smt_power_savings, 0644, - sched_smt_power_savings_show, - sched_smt_power_savings_store); -#endif - -int __init sched_create_sysfs_power_savings_entries(struct device *dev) -{ - int err = 0; - -#ifdef CONFIG_SCHED_SMT - if (smt_capable()) - err = device_create_file(dev, &dev_attr_sched_smt_power_savings); -#endif -#ifdef CONFIG_SCHED_MC - if (!err && mc_capable()) - err = device_create_file(dev, &dev_attr_sched_mc_power_savings); -#endif - return err; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - /* * Update cpusets according to cpu_active mask. If cpusets are * disabled, cpuset_update_active_cpus() becomes a simple wrapper @@ -6825,6 +6853,8 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + sched_init_numa(); + get_online_cpus(); mutex_lock(&sched_domains_mutex); init_sched_domains(cpu_active_mask); @@ -7046,6 +7076,7 @@ void __init sched_init(void) /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + idle_thread_set_boot_cpu(); #endif init_sched_fair_class(); @@ -7967,13 +7998,9 @@ static struct cftype cpu_files[] = { .write_u64 = cpu_rt_period_write_uint, }, #endif + { } /* terminate */ }; -static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); -} - struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .create = cpu_cgroup_create, @@ -7981,8 +8008,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, + .base_cftypes = cpu_files, .early_init = 1, }; @@ -8167,13 +8194,9 @@ static struct cftype files[] = { .name = "stat", .read_map = cpuacct_stats_show, }, + { } /* terminate */ }; -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); -} - /* * charge this task's execution time to its accounting group. * @@ -8205,7 +8228,7 @@ struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", .create = cpuacct_create, .destroy = cpuacct_destroy, - .populate = cpuacct_populate, .subsys_id = cpuacct_subsys_id, + .base_cftypes = files, }; #endif /* CONFIG_CGROUP_CPUACCT */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 09acaa15161..6f79596e0ea 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); - SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP @@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu) SEQ_printf(m, "\ncpu#%d\n", cpu); #endif -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) +#define P(x) \ +do { \ + if (sizeof(rq->x) == 4) \ + SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ + else \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ +} while (0) + #define PN(x) \ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0d97ebdc58f..940e6d17cf9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) - list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); + list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); #endif cfs_rq->nr_running++; } @@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) * If power savings logic is enabled for a domain, see if we * are not overloaded, if so, don't balance wider. */ - if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { + if (tmp->flags & (SD_PREFER_LOCAL)) { unsigned long power = 0; unsigned long nr_running = 0; unsigned long capacity; @@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - nr_running /= 2; - if (nr_running < capacity) want_sd = 0; } @@ -3082,7 +3079,7 @@ struct lb_env { struct rq *dst_rq; enum cpu_idle_type idle; - long load_move; + long imbalance; unsigned int flags; unsigned int loop; @@ -3215,8 +3212,10 @@ static int move_one_task(struct lb_env *env) static unsigned long task_h_load(struct task_struct *p); +static const unsigned int sched_nr_migrate_break = 32; + /* - * move_tasks tries to move up to load_move weighted load from busiest to + * move_tasks tries to move up to imbalance weighted load from busiest to * this_rq, as part of a balancing operation within domain "sd". * Returns 1 if successful and 0 otherwise. * @@ -3229,7 +3228,7 @@ static int move_tasks(struct lb_env *env) unsigned long load; int pulled = 0; - if (env->load_move <= 0) + if (env->imbalance <= 0) return 0; while (!list_empty(tasks)) { @@ -3242,7 +3241,7 @@ static int move_tasks(struct lb_env *env) /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { - env->loop_break += sysctl_sched_nr_migrate; + env->loop_break += sched_nr_migrate_break; env->flags |= LBF_NEED_BREAK; break; } @@ -3252,10 +3251,10 @@ static int move_tasks(struct lb_env *env) load = task_h_load(p); - if (load < 16 && !env->sd->nr_balance_failed) + if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) goto next; - if ((load / 2) > env->load_move) + if ((load / 2) > env->imbalance) goto next; if (!can_migrate_task(p, env)) @@ -3263,7 +3262,7 @@ static int move_tasks(struct lb_env *env) move_task(p, env); pulled++; - env->load_move -= load; + env->imbalance -= load; #ifdef CONFIG_PREEMPT /* @@ -3279,7 +3278,7 @@ static int move_tasks(struct lb_env *env) * We only want to steal up to the prescribed amount of * weighted load. */ - if (env->load_move <= 0) + if (env->imbalance <= 0) break; continue; @@ -3433,14 +3432,6 @@ struct sd_lb_stats { unsigned int busiest_group_weight; int group_imb; /* Is there imbalance in this sd */ -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - int power_savings_balance; /* Is powersave balance needed for this sd */ - struct sched_group *group_min; /* Least loaded group in sd */ - struct sched_group *group_leader; /* Group which relieves group_min */ - unsigned long min_load_per_task; /* load_per_task in group_min */ - unsigned long leader_nr_running; /* Nr running of group_leader */ - unsigned long min_nr_running; /* Nr running of group_min */ -#endif }; /* @@ -3484,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * init_sd_power_savings_stats - Initialize power savings statistics for - * the given sched_domain, during load balancing. - * - * @sd: Sched domain whose power-savings statistics are to be initialized. - * @sds: Variable containing the statistics for sd. - * @idle: Idle status of the CPU at which we're performing load-balancing. - */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, - struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - sds->power_savings_balance = 0; - else { - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; - sds->leader_nr_running = 0; - } -} - -/** - * update_sd_power_savings_stats - Update the power saving stats for a - * sched_domain while performing load balancing. - * - * @group: sched_group belonging to the sched_domain under consideration. - * @sds: Variable containing the statistics of the sched_domain - * @local_group: Does group contain the CPU for which we're performing - * load balancing ? - * @sgs: Variable containing the statistics of the group. - */ -static inline void update_sd_power_savings_stats(struct sched_group *group, - struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - - if (!sds->power_savings_balance) - return; - - /* - * If the local group is idle or completely loaded - * no need to do power savings balance at this domain - */ - if (local_group && (sds->this_nr_running >= sgs->group_capacity || - !sds->this_nr_running)) - sds->power_savings_balance = 0; - - /* - * If a group is already running at full capacity or idle, - * don't include that group in power savings calculations - */ - if (!sds->power_savings_balance || - sgs->sum_nr_running >= sgs->group_capacity || - !sgs->sum_nr_running) - return; - - /* - * Calculate the group which has the least non-idle load. - * This is the group from where we need to pick up the load - * for saving power - */ - if ((sgs->sum_nr_running < sds->min_nr_running) || - (sgs->sum_nr_running == sds->min_nr_running && - group_first_cpu(group) > group_first_cpu(sds->group_min))) { - sds->group_min = group; - sds->min_nr_running = sgs->sum_nr_running; - sds->min_load_per_task = sgs->sum_weighted_load / - sgs->sum_nr_running; - } - - /* - * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sgs->sum_nr_running + 1 > sgs->group_capacity) - return; - - if (sgs->sum_nr_running > sds->leader_nr_running || - (sgs->sum_nr_running == sds->leader_nr_running && - group_first_cpu(group) < group_first_cpu(sds->group_leader))) { - sds->group_leader = group; - sds->leader_nr_running = sgs->sum_nr_running; - } -} - -/** - * check_power_save_busiest_group - see if there is potential for some power-savings balance - * @sds: Variable containing the statistics of the sched_domain - * under consideration. - * @this_cpu: Cpu at which we're currently performing load-balancing. - * @imbalance: Variable to store the imbalance. - * - * Description: - * Check if we have potential to perform some power-savings balance. - * If yes, set the busiest group to be the least loaded group in the - * sched_domain, so that it's CPUs can be put to idle. - * - * Returns 1 if there is potential to perform power-savings balance. - * Else returns 0. - */ -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) -{ - if (!sds->power_savings_balance) - return 0; - - if (sds->this != sds->group_leader || - sds->group_leader == sds->group_min) - return 0; - - *imbalance = sds->min_load_per_task; - sds->busiest = sds->group_min; - - return 1; - -} -#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, - struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ - return; -} - -static inline void update_sd_power_savings_stats(struct sched_group *group, - struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - return; -} - -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) -{ - return 0; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - - unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) { return SCHED_POWER_SCALE; @@ -3763,24 +3612,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @sd: The sched_domain whose statistics are to be updated. * @group: sched_group whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. */ -static inline void update_sg_lb_stats(struct sched_domain *sd, - struct sched_group *group, int this_cpu, - enum cpu_idle_type idle, int load_idx, +static inline void update_sg_lb_stats(struct lb_env *env, + struct sched_group *group, int load_idx, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { - unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; - int i; + unsigned long nr_running, max_nr_running, min_nr_running; + unsigned long load, max_cpu_load, min_cpu_load; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long avg_load_per_task = 0; + int i; if (local_group) balance_cpu = group_first_cpu(group); @@ -3789,10 +3636,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, max_cpu_load = 0; min_cpu_load = ~0UL; max_nr_running = 0; + min_nr_running = ~0UL; for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); + nr_running = rq->nr_running; + /* Bias balancing toward cpus of our domain */ if (local_group) { if (idle_cpu(i) && !first_idle_cpu) { @@ -3803,16 +3653,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, load = target_load(i, load_idx); } else { load = source_load(i, load_idx); - if (load > max_cpu_load) { + if (load > max_cpu_load) max_cpu_load = load; - max_nr_running = rq->nr_running; - } if (min_cpu_load > load) min_cpu_load = load; + + if (nr_running > max_nr_running) + max_nr_running = nr_running; + if (min_nr_running > nr_running) + min_nr_running = nr_running; } sgs->group_load += load; - sgs->sum_nr_running += rq->nr_running; + sgs->sum_nr_running += nr_running; sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; @@ -3825,14 +3678,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * to do the newly idle load balance. */ if (local_group) { - if (idle != CPU_NEWLY_IDLE) { - if (balance_cpu != this_cpu) { + if (env->idle != CPU_NEWLY_IDLE) { + if (balance_cpu != env->dst_cpu) { *balance = 0; return; } - update_group_power(sd, this_cpu); + update_group_power(env->sd, env->dst_cpu); } else if (time_after_eq(jiffies, group->sgp->next_update)) - update_group_power(sd, this_cpu); + update_group_power(env->sd, env->dst_cpu); } /* Adjust by relative CPU power of the group */ @@ -3850,13 +3703,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) + if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && + (max_nr_running - min_nr_running) > 1) sgs->group_imb = 1; sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, SCHED_POWER_SCALE); if (!sgs->group_capacity) - sgs->group_capacity = fix_small_capacity(sd, group); + sgs->group_capacity = fix_small_capacity(env->sd, group); sgs->group_weight = group->group_weight; if (sgs->group_capacity > sgs->sum_nr_running) @@ -3874,11 +3728,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * Determine if @sg is a busier group than the previously selected * busiest group. */ -static bool update_sd_pick_busiest(struct sched_domain *sd, +static bool update_sd_pick_busiest(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *sg, - struct sg_lb_stats *sgs, - int this_cpu) + struct sg_lb_stats *sgs) { if (sgs->avg_load <= sds->max_load) return false; @@ -3894,8 +3747,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, * numbered CPUs in the group, therefore mark all groups * higher than ourself as busy. */ - if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && - this_cpu < group_first_cpu(sg)) { + if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && + env->dst_cpu < group_first_cpu(sg)) { if (!sds->busiest) return tr |