From cd62287e364c0d15d517c6ced4e4808b54711475 Mon Sep 17 00:00:00 2001
From: Mike Galbraith
Date: Sat, 4 Jun 2011 15:03:20 +0200
Subject: sched, cgroups: Fix MIN_SHARES on 64-bit boxen

Commit c8b28116 ("sched: Increase SCHED_LOAD_SCALE resolution") was
intended to have no user-visible effect, but it allows setting
cpu.shares to < MIN_SHARES, which the user then sees.

Signed-off-by: Mike Galbraith
Signed-off-by: Peter Zijlstra
Cc: Nikhil Rao
Link: http://lkml.kernel.org/r/1307192600.8618.3.camel@marge.simson.net
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3f2e502d609..9769c756ad6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -292,8 +292,8 @@ static DEFINE_SPINLOCK(task_group_lock);
  * (The default weight is 1024 - so there's no practical
  * limitation from this.)
  */
-#define MIN_SHARES	2
-#define MAX_SHARES	(1UL << (18 + SCHED_LOAD_RESOLUTION))
+#define MIN_SHARES	(1UL << 1)
+#define MAX_SHARES	(1UL << 18)

 static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
 #endif
@@ -8450,10 +8450,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	if (!tg->se[0])
 		return -EINVAL;

-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	else if (shares > MAX_SHARES)
-		shares = MAX_SHARES;
+	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

 	mutex_lock(&shares_mutex);
 	if (tg->shares == shares)
--
cgit v1.2.3-18-g5258

From c64be78ffb415278d7d32d6f55de95c73dcc19a4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 11 Jul 2011 16:28:50 +0200
Subject: sched: Fix 32bit race

Commit 3fe1698b7fe0 ("sched: Deal with non-atomic min_vruntime reads
on 32bit") forgot to initialize min_vruntime_copy, which could lead to
an infinite while loop in task_waking_fair() under some circumstances
(early boot, lucky timing).

[ This bug was also reported by others who blamed it on the RCU
  initialization problems. ]

Reported-and-tested-by: Bruno Wolff III
Signed-off-by: Peter Zijlstra
Reviewed-by: Paul E. McKenney
Signed-off-by: Linus Torvalds
---
 kernel/sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9769c756ad6..3dc716f6d8a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7757,6 +7757,9 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 #endif
 #endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+#ifndef CONFIG_64BIT
+	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
 }

 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
--
cgit v1.2.3-18-g5258
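For context, the 32-bit min_vruntime handling that this fix completes follows a
seqcount-like pattern: the writer publishes a second copy of the 64-bit value
behind a write barrier, and the reader retries until both copies agree. The
sketch below is illustrative only (the helper names are made up; the field
names match the cfs_rq fields in the diff above) and is not the exact kernel
code, but it shows why an uninitialized min_vruntime_copy can make the read
loop spin forever.

/* Writer side: publish min_vruntime so 32-bit readers see a consistent value. */
static void publish_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
{
	cfs_rq->min_vruntime = vruntime;
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/* Reader side: retry until the two copies form a consistent snapshot. */
static u64 read_min_vruntime(struct cfs_rq *cfs_rq)
{
#ifndef CONFIG_64BIT
	u64 copy, val;

	do {
		copy = cfs_rq->min_vruntime_copy;
		smp_rmb();
		val = cfs_rq->min_vruntime;
	} while (val != copy);

	return val;
#else
	return cfs_rq->min_vruntime;
#endif
}

Without the initialization added above, min_vruntime_copy can start out
different from min_vruntime on a freshly initialized cfs_rq, and a reader that
samples it before the first update never leaves the loop.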
From 9c3f75cbd144014bea6af866a154cc2e73ab2287 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 14 Jul 2011 13:00:06 +0200
Subject: sched: Break out cpu_power from the sched_group structure

In order to prepare for non-unique sched_groups per domain, we need to
carry the cpu_power elsewhere, so put a level of indirection in.

Reported-and-tested-by: Anton Blanchard
Signed-off-by: Peter Zijlstra
Cc: Linus Torvalds
Cc: Andrew Morton
Link: http://lkml.kernel.org/n/tip-qkho2byuhe4482fuknss40ad@git.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3dc716f6d8a..36c10d25d4c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6557,7 +6557,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}

-		if (!group->cpu_power) {
+		if (!group->sgp->power) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");
@@ -6581,9 +6581,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));

 		printk(KERN_CONT " %s", str);
-		if (group->cpu_power != SCHED_POWER_SCALE) {
-			printk(KERN_CONT " (cpu_power = %d)",
-				group->cpu_power);
+		if (group->sgp->power != SCHED_POWER_SCALE) {
+			printk(KERN_CONT " (cpu_power = %d)",
+				group->sgp->power);
 		}

 		group = group->next;
@@ -6777,8 +6777,10 @@ static struct root_domain *alloc_rootdomain(void)
 static void free_sched_domain(struct rcu_head *rcu)
 {
 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-	if (atomic_dec_and_test(&sd->groups->ref))
+	if (atomic_dec_and_test(&sd->groups->ref)) {
+		kfree(sd->groups->sgp);
 		kfree(sd->groups);
+	}
 	kfree(sd);
 }

@@ -6945,6 +6947,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 struct sd_data {
 	struct sched_domain **__percpu sd;
 	struct sched_group **__percpu sg;
+	struct sched_group_power **__percpu sgp;
 };

 struct s_data {
@@ -6981,8 +6984,10 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));

-	if (sg)
+	if (sg) {
 		*sg = *per_cpu_ptr(sdd->sg, cpu);
+		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+	}

 	return cpu;
 }
@@ -7020,7 +7025,7 @@ build_sched_groups(struct sched_domain *sd)
 			continue;

 		cpumask_clear(sched_group_cpus(sg));
-		sg->cpu_power = 0;
+		sg->sgp->power = 0;

 		for_each_cpu(j, span) {
 			if (get_group(j, sdd, NULL) != group)
@@ -7185,6 +7190,7 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 	if (cpu == cpumask_first(sched_group_cpus(sg))) {
 		WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
+		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
 	}
 }

@@ -7234,9 +7240,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 		if (!sdd->sg)
 			return -ENOMEM;

+		sdd->sgp = alloc_percpu(struct sched_group_power *);
+		if (!sdd->sgp)
+			return -ENOMEM;
+
 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
 			struct sched_group *sg;
+			struct sched_group_power *sgp;

 			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
@@ -7251,6 +7262,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 				return -ENOMEM;

 			*per_cpu_ptr(sdd->sg, j) = sg;
+
+			sgp = kzalloc_node(sizeof(struct sched_group_power),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sgp)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sgp, j) = sgp;
 		}
 	}

@@ -7268,9 +7286,11 @@ static void __sdt_free(const struct cpumask *cpu_map)
 		for_each_cpu(j, cpu_map) {
 			kfree(*per_cpu_ptr(sdd->sd, j));
 			kfree(*per_cpu_ptr(sdd->sg, j));
+			kfree(*per_cpu_ptr(sdd->sgp, j));
 		}
 		free_percpu(sdd->sd);
 		free_percpu(sdd->sg);
+		free_percpu(sdd->sgp);
 	}
 }
--
cgit v1.2.3-18-g5258
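The include/linux/sched.h side of this change is not part of the hunks shown
here. Schematically, the new indirection looks roughly like the sketch below;
the field names follow what the kernel/sched.c hunks use (sgp, power, ref,
next, the trailing cpumask), and everything else is omitted or simplified.

struct sched_group_power {
	atomic_t ref;			/* shared by every group that points at it */
	unsigned int power;		/* what used to be sched_group::cpu_power */
};

struct sched_group {
	struct sched_group *next;	/* circular list of groups in this domain */
	atomic_t ref;
	unsigned int group_weight;
	struct sched_group_power *sgp;	/* the new level of indirection */
	unsigned long cpumask[0];	/* CPUs covered, see sched_group_cpus() */
};

With cpu_power behind a pointer, several sched_group instances can later share
a single sched_group_power, which is exactly what the overlapping-span patch
below relies on.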
From e3589f6c81e4764d32a25d2a2a0afe54fa344f5c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 15 Jul 2011 10:35:52 +0200
Subject: sched: Allow for overlapping sched_domain spans

Allow for sched_domain spans that overlap by giving such domains their
own sched_group list instead of sharing the sched_groups amongst each
other.

This is needed for machines with more than 16 nodes, because
sched_domain_node_span() will generate a node mask from the 16 nearest
nodes without regard to whether these masks overlap.

Currently sched_domains have a sched_group that maps to their child
sched_domain span, and since there is no overlap we share the
sched_group between the sched_domains of the various CPUs. If, however,
there is overlap, we would need to link the sched_group list in
different ways for each CPU, and hence sharing isn't possible.

In order to solve this, allocate private sched_groups for each CPU's
sched_domain but have the sched_groups share a sched_group_power
structure such that we can uniquely track the power.

Reported-and-tested-by: Anton Blanchard
Signed-off-by: Peter Zijlstra
Cc: Linus Torvalds
Cc: Andrew Morton
Link: http://lkml.kernel.org/n/tip-08bxqw9wis3qti9u5inifh3y@git.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 128 insertions(+), 29 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 36c10d25d4c..921adf6f6fa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6774,10 +6774,36 @@ static struct root_domain *alloc_rootdomain(void)
 	return rd;
 }

+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+	struct sched_group *tmp, *first;
+
+	if (!sg)
+		return;
+
+	first = sg;
+	do {
+		tmp = sg->next;
+
+		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+			kfree(sg->sgp);
+
+		kfree(sg);
+		sg = tmp;
+	} while (sg != first);
+}
+
 static void free_sched_domain(struct rcu_head *rcu)
 {
 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-	if (atomic_dec_and_test(&sd->groups->ref)) {
+
+	/*
+	 * If its an overlapping domain it has private groups, iterate and
+	 * nuke them all.
+	 */
+	if (sd->flags & SD_OVERLAP) {
+		free_sched_groups(sd->groups, 1);
+	} else if (atomic_dec_and_test(&sd->groups->ref)) {
 		kfree(sd->groups->sgp);
 		kfree(sd->groups);
 	}
@@ -6967,15 +6993,73 @@ struct sched_domain_topology_level;
 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);

+#define SDTL_OVERLAP	0x01
+
 struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
+	int		    flags;
 	struct sd_data      data;
 };

-/*
- * Assumes the sched_domain tree is fully constructed
- */
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+	const struct cpumask *span = sched_domain_span(sd);
+	struct cpumask *covered = sched_domains_tmpmask;
+	struct sd_data *sdd = sd->private;
+	struct sched_domain *child;
+	int i;
+
+	cpumask_clear(covered);
+
+	for_each_cpu(i, span) {
+		struct cpumask *sg_span;
+
+		if (cpumask_test_cpu(i, covered))
+			continue;
+
+		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				GFP_KERNEL, cpu_to_node(i));
+
+		if (!sg)
+			goto fail;
+
+		sg_span = sched_group_cpus(sg);
+
+		child = *per_cpu_ptr(sdd->sd, i);
+		if (child->child) {
+			child = child->child;
+			cpumask_copy(sg_span, sched_domain_span(child));
+		} else
+			cpumask_set_cpu(i, sg_span);
+
+		cpumask_or(covered, covered, sg_span);
+
+		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+		atomic_inc(&sg->sgp->ref);
+
+		if (cpumask_test_cpu(cpu, sg_span))
+			groups = sg;
+
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+		last->next = first;
+	}
+	sd->groups = groups;
+
+	return 0;
+
+fail:
+	free_sched_groups(first, 0);
+
+	return -ENOMEM;
+}
+
 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 {
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -6987,23 +7071,21 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 	if (sg) {
 		*sg = *per_cpu_ptr(sdd->sg, cpu);
 		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
 	}

 	return cpu;
 }

 /*
- * build_sched_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
  * build_sched_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
  */
-static void
-build_sched_groups(struct sched_domain *sd)
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
 {
 	struct sched_group *first = NULL, *last = NULL;
 	struct sd_data *sdd = sd->private;
@@ -7011,6 +7093,12 @@ build_sched_groups(struct sched_domain *sd)
 	struct cpumask *covered;
 	int i;

+	get_group(cpu, sdd, &sd->groups);
+	atomic_inc(&sd->groups->ref);
+
+	if (cpu != cpumask_first(sched_domain_span(sd)))
+		return 0;
+
 	lockdep_assert_held(&sched_domains_mutex);
 	covered = sched_domains_tmpmask;
@@ -7042,6 +7130,8 @@ build_sched_groups(struct sched_domain *sd)
 		last = sg;
 	}
 	last->next = first;
+
+	return 0;
 }

 /*
@@ -7056,12 +7146,17 @@ build_sched_groups(struct sched_domain *sd)
  */
 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 {
-	WARN_ON(!sd || !sd->groups);
+	struct sched_group *sg = sd->groups;

-	if (cpu != group_first_cpu(sd->groups))
-		return;
+	WARN_ON(!sd || !sg);
+
+	do {
+		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+		sg = sg->next;
+	} while (sg != sd->groups);

-	sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+	if (cpu != group_first_cpu(sg))
+		return;

 	update_group_power(sd, cpu);
 }
@@ -7182,16 +7277,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 static void claim_allocations(int cpu, struct sched_domain *sd)
 {
 	struct sd_data *sdd = sd->private;
-	struct sched_group *sg = sd->groups;

 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
 	*per_cpu_ptr(sdd->sd, cpu) = NULL;

-	if (cpu == cpumask_first(sched_group_cpus(sg))) {
-		WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
+
+	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
 		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
-	}
 }

 #ifdef CONFIG_SCHED_SMT
@@ -7216,7 +7310,7 @@ static struct sched_domain_topology_level default_topology[] = {
 #endif
 	{ sd_init_CPU, cpu_cpu_mask, },
 #ifdef CONFIG_NUMA
-	{ sd_init_NODE, cpu_node_mask, },
+	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
 	{ sd_init_ALLNODES, cpu_allnodes_mask, },
 #endif
 	{ NULL, },
@@ -7284,7 +7378,9 @@ static void __sdt_free(const struct cpumask *cpu_map)
 		struct sd_data *sdd = &tl->data;

 		for_each_cpu(j, cpu_map) {
-			kfree(*per_cpu_ptr(sdd->sd, j));
+			struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+			if (sd && (sd->flags & SD_OVERLAP))
+				free_sched_groups(sd->groups, 0);
 			kfree(*per_cpu_ptr(sdd->sg, j));
 			kfree(*per_cpu_ptr(sdd->sgp, j));
 		}
@@ -7336,8 +7432,11 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 		struct sched_domain_topology_level *tl;

 		sd = NULL;
-		for (tl = sched_domain_topology; tl->init; tl++)
+		for (tl = sched_domain_topology; tl->init; tl++) {
 			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+				sd->flags |= SD_OVERLAP;
+		}

 		while (sd->child)
 			sd = sd->child;
@@ -7349,13 +7448,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 	for_each_cpu(i, cpu_map) {
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			sd->span_weight = cpumask_weight(sched_domain_span(sd));
-			get_group(i, sd->private, &sd->groups);
-			atomic_inc(&sd->groups->ref);
-
-			if (i != cpumask_first(sched_domain_span(sd)))
-				continue;
-
-			build_sched_groups(sd);
+			if (sd->flags & SD_OVERLAP) {
+				if (build_overlap_sched_groups(sd, i))
+					goto error;
+			} else {
+				if (build_sched_groups(sd, i))
+					goto error;
+			}
 		}
 	}
--
cgit v1.2.3-18-g5258
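The group lists built above are circular, and for SD_OVERLAP domains each CPU
now gets its own private list while the sched_group_power objects remain shared
and reference counted. A minimal sketch of how such a list is typically
consumed (illustrative only; sum_group_power is not a kernel function):

/* Walk a circular sched_group list, starting at this CPU's own group. */
static unsigned long sum_group_power(struct sched_group *start)
{
	struct sched_group *sg = start;
	unsigned long total = 0;

	do {
		total += sg->sgp->power;	/* sgp may be shared with other CPUs' lists */
		sg = sg->next;			/* ->next eventually wraps back to 'start' */
	} while (sg != start);

	return total;
}

Keeping the power object shared is what lets an update to sgp->power made on
one CPU be seen by every other CPU whose private list points at the same
sched_group_power.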
From d110235d2c331c4f79e0879f51104be79e17a469 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 20 Jul 2011 18:42:57 +0200
Subject: sched: Avoid creating superfluous NUMA domains on non-NUMA systems

When creating sched_domains, stop when we've covered the entire target
span instead of continuing to create domains, only to later find they're
redundant and throw them away again.

This keeps single-node systems from touching the funny NUMA sched_domain
creation code and reduces the risk posed by the new SD_OVERLAP code.

Requested-by: Linus Torvalds
Signed-off-by: Peter Zijlstra
Cc: Anton Blanchard
Cc: mahesh@linux.vnet.ibm.com
Cc: benh@kernel.crashing.org
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/1311180177.29152.57.camel@twins
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 921adf6f6fa..14168c49a15 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7436,6 +7436,8 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
 			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
 				sd->flags |= SD_OVERLAP;
+			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+				break;
 		}

 		while (sd->child)
--
cgit v1.2.3-18-g5258
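The two added lines rely on the topology levels being walked bottom-up, with
each level's domain spanning at least as much as its child's: once a level's
span equals the cpu_map being built, the wider (NUMA) levels have nothing left
to add. A simplified view of the loop after this patch (error handling elided,
comment added for illustration):

	for (tl = sched_domain_topology; tl->init; tl++) {
		sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
		if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
			sd->flags |= SD_OVERLAP;
		/*
		 * On a single-node box the CPU level already covers cpu_map,
		 * so the NUMA entries later in default_topology[] are never
		 * instantiated.
		 */
		if (cpumask_equal(cpu_map, sched_domain_span(sd)))
			break;
	}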
From c5d753a55ac92e09816d410cd17093813f1a904b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 19 Jul 2011 15:07:25 -0700
Subject: sched: Add irq_{enter,exit}() to scheduler_ipi()

Ensure scheduler_ipi() calls irq_{enter,exit} when it does some actual
work. Traditionally we never did any actual work from the resched IPI
and all magic happened in the return from interrupt path.

Now that we do some actual work, we need to ensure irq_{enter,exit} are
called so that we don't confuse things. This affects things like
timekeeping, NO_HZ and RCU, basically everything with a hook in
irq_enter/exit.

Explicit examples of things going wrong are:

  sched_clock_cpu() -- has a callback when leaving NO_HZ state to take
    a new reading from GTOD and TSC. Without this callback, time is
    stuck in the past.

  RCU -- needs in_irq() to work in order to avoid some nasty deadlocks.

Signed-off-by: Peter Zijlstra
Signed-off-by: Paul E. McKenney
---
 kernel/sched.c | 44 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9769c756ad6..1930ee19d98 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2544,13 +2544,9 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 }

 #ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+static void sched_ttwu_do_pending(struct task_struct *list)
 {
 	struct rq *rq = this_rq();
-	struct task_struct *list = xchg(&rq->wake_list, NULL);
-
-	if (!list)
-		return;

 	raw_spin_lock(&rq->lock);

@@ -2563,9 +2559,45 @@ static void sched_ttwu_pending(void)
 	raw_spin_unlock(&rq->lock);
 }

+#ifdef CONFIG_HOTPLUG_CPU
+
+static void sched_ttwu_pending(void)
+{
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	sched_ttwu_do_pending(list);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
 void scheduler_ipi(void)
 {
-	sched_ttwu_pending();
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	/*
+	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+	 * traditionally all their work was done from the interrupt return
+	 * path. Now that we actually do some work, we need to make sure
+	 * we do call them.
+	 *
+	 * Some archs already do call them, luckily irq_enter/exit nest
+	 * properly.
+	 *
+	 * Arguably we should visit all archs and update all handlers,
+	 * however a fair share of IPIs are still resched only so this would
+	 * somewhat pessimize the simple resched case.
+	 */
+	irq_enter();
+	sched_ttwu_do_pending(list);
+	irq_exit();
 }

 static void ttwu_queue_remote(struct task_struct *p, int cpu)
--
cgit v1.2.3-18-g5258
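The rule this patch enforces is the general one for any handler that does real
work in hard-interrupt context outside the architecture's normal IRQ entry
path: bracket the work with irq_enter()/irq_exit() so that in_irq() is true and
the timekeeping/NO_HZ/RCU hooks run. A minimal sketch of that shape
(my_ipi_handler and do_pending_work are placeholders, not kernel symbols):

/* Generic shape of an IPI handler that does real work. */
void my_ipi_handler(void)
{
	irq_enter();		/* enter hardirq context: NO_HZ/RCU/timekeeping hooks fire */

	do_pending_work();	/* whatever the IPI was sent to accomplish */

	irq_exit();		/* leave hardirq context: pending softirq work gets handled */
}

Because irq_enter()/irq_exit() nest properly, this stays correct even on
architectures whose low-level IPI entry code already calls them, as the commit
message notes.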