author     Linus Torvalds <torvalds@linux-foundation.org>   2009-01-02 11:44:09 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>   2009-01-02 11:44:09 -0800
commit     b840d79631c882786925303c2b0f4fefc31845ed
tree       cda60a95d4507fe1321fc285af38982d7eb9693b /kernel
parent     597b0d21626da4e6f09f132442caf0cc2b0eb47c
parent     c3d80000e3a812fe5a200d6bde755fbd7fa65481
Merge branch 'cpus4096-for-linus-2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'cpus4096-for-linus-2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (66 commits)
  x86: export vector_used_by_percpu_irq
  x86: use logical apicid in x2apic_cluster's x2apic_cpu_mask_to_apicid_and()
  sched: nominate preferred wakeup cpu, fix
  x86: fix lguest used_vectors breakage, -v2
  x86: fix warning in arch/x86/kernel/io_apic.c
  sched: fix warning in kernel/sched.c
  sched: move test_sd_parent() to an SMP section of sched.h
  sched: add SD_BALANCE_NEWIDLE at MC and CPU level for sched_mc>0
  sched: activate active load balancing in new idle cpus
  sched: bias task wakeups to preferred semi-idle packages
  sched: nominate preferred wakeup cpu
  sched: favour lower logical cpu number for sched_mc balance
  sched: framework for sched_mc/smt_power_savings=N
  sched: convert BALANCE_FOR_xx_POWER to inline functions
  x86: use possible_cpus=NUM to extend the possible cpus allowed
  x86: fix cpu_mask_to_apicid_and to include cpu_online_mask
  x86: update io_apic.c to the new cpumask code
  x86: Introduce topology_core_cpumask()/topology_thread_cpumask()
  x86: xen: use smp_call_function_many()
  x86: use work_on_cpu in x86/kernel/cpu/mcheck/mce_amd_64.c
  ...

Fixed up trivial conflict in kernel/time/tick-sched.c manually
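The bulk of this series converts fixed-size on-stack cpumask_t variables to dynamically
allocated cpumask_var_t, so stack usage no longer scales with NR_CPUS (at 4096 CPUs a
single mask is already 512 bytes). A minimal sketch of the target pattern follows; it is
illustrative only, not taken from the patch, and the helper name is made up.

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/sched.h>

/* Hypothetical helper: restrict a task to the online subset of its allowed CPUs. */
static int example_restrict_to_online(struct task_struct *p)
{
	cpumask_var_t mask;	/* a pointer to a heap bitmap when CONFIG_CPUMASK_OFFSTACK=y */
	int retval = 0;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	/* old style: cpumask_t mask; cpus_and(mask, p->cpus_allowed, cpu_online_map); */
	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);

	if (!cpumask_empty(mask))
		retval = set_cpus_allowed_ptr(p, mask);	/* new helpers take const struct cpumask * */

	free_cpumask_var(mask);
	return retval;
}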
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpu.c                   |  11
-rw-r--r--  kernel/cpuset.c                |   4
-rw-r--r--  kernel/irq/chip.c              |   2
-rw-r--r--  kernel/irq/manage.c            |  22
-rw-r--r--  kernel/irq/migration.c         |  14
-rw-r--r--  kernel/irq/proc.c              |  29
-rw-r--r--  kernel/profile.c               |   4
-rw-r--r--  kernel/rcuclassic.c            |   2
-rw-r--r--  kernel/sched.c                 | 970
-rw-r--r--  kernel/sched_cpupri.c          |  39
-rw-r--r--  kernel/sched_cpupri.h          |   5
-rw-r--r--  kernel/sched_fair.c            |  32
-rw-r--r--  kernel/sched_rt.c              |  73
-rw-r--r--  kernel/sched_stats.h           |   3
-rw-r--r--  kernel/taskstats.c             |   2
-rw-r--r--  kernel/time/clockevents.c      |   2
-rw-r--r--  kernel/time/tick-broadcast.c   |   2
-rw-r--r--  kernel/time/tick-common.c      |  12
-rw-r--r--  kernel/time/tick-sched.c       |  10
-rw-r--r--  kernel/trace/trace.c           |   4
20 files changed, 722 insertions, 520 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ea32e8d68b..bae131a1211 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -24,19 +24,20 @@
cpumask_t cpu_present_map __read_mostly;
EXPORT_SYMBOL(cpu_present_map);
-#ifndef CONFIG_SMP
-
/*
* Represents all cpu's that are currently online.
*/
-cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+cpumask_t cpu_online_map __read_mostly;
EXPORT_SYMBOL(cpu_online_map);
+#ifdef CONFIG_INIT_ALL_POSSIBLE
cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+#else
+cpumask_t cpu_possible_map __read_mostly;
+#endif
EXPORT_SYMBOL(cpu_possible_map);
-#else /* CONFIG_SMP */
-
+#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_map, cpu_present_map */
static DEFINE_MUTEX(cpu_add_remove_lock);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 96c0ba13b8c..39c1a4c1c5a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -896,7 +896,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
if (!*buf) {
cpus_clear(trialcs.cpus_allowed);
} else {
- retval = cpulist_parse(buf, trialcs.cpus_allowed);
+ retval = cpulist_parse(buf, &trialcs.cpus_allowed);
if (retval < 0)
return retval;
@@ -1482,7 +1482,7 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
mask = cs->cpus_allowed;
mutex_unlock(&callback_mutex);
- return cpulist_scnprintf(page, PAGE_SIZE, mask);
+ return cpulist_scnprintf(page, PAGE_SIZE, &mask);
}
static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
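The cpulist helpers touched in the hunks above now take a pointer to the mask rather than
a cpumask_t by value. A small illustrative round trip (not from the patch; the function
name is hypothetical):

#include <linux/cpumask.h>
#include <linux/gfp.h>

/* Hypothetical helper: parse a cpu list string and format it back. */
static void example_cpulist_roundtrip(void)
{
	char buf[64];
	cpumask_var_t allowed;

	if (!alloc_cpumask_var(&allowed, GFP_KERNEL))
		return;

	if (cpulist_parse("0-3,8", allowed) == 0)		/* "0-3,8" -> bits 0,1,2,3,8 */
		cpulist_scnprintf(buf, sizeof(buf), allowed);	/* bitmap -> "0-3,8" */

	free_cpumask_var(allowed);
}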
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6eb3c7952b6..f63c706d25e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -46,7 +46,7 @@ void dynamic_irq_init(unsigned int irq)
desc->irq_count = 0;
desc->irqs_unhandled = 0;
#ifdef CONFIG_SMP
- cpus_setall(desc->affinity);
+ cpumask_setall(&desc->affinity);
#endif
spin_unlock_irqrestore(&desc->lock, flags);
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 540f6c49f3f..61c4a9b6216 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -79,7 +79,7 @@ int irq_can_set_affinity(unsigned int irq)
* @cpumask: cpumask
*
*/
-int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
@@ -91,14 +91,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
- desc->affinity = cpumask;
+ cpumask_copy(&desc->affinity, cpumask);
desc->chip->set_affinity(irq, cpumask);
} else {
desc->status |= IRQ_MOVE_PENDING;
- desc->pending_mask = cpumask;
+ cpumask_copy(&desc->pending_mask, cpumask);
}
#else
- desc->affinity = cpumask;
+ cpumask_copy(&desc->affinity, cpumask);
desc->chip->set_affinity(irq, cpumask);
#endif
desc->status |= IRQ_AFFINITY_SET;
@@ -112,26 +112,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
*/
int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
{
- cpumask_t mask;
-
if (!irq_can_set_affinity(irq))
return 0;
- cpus_and(mask, cpu_online_map, irq_default_affinity);
-
/*
* Preserve an userspace affinity setup, but make sure that
* one of the targets is online.
*/
if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
- if (cpus_intersects(desc->affinity, cpu_online_map))
- mask = desc->affinity;
+ if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+ < nr_cpu_ids)
+ goto set_affinity;
else
desc->status &= ~IRQ_AFFINITY_SET;
}
- desc->affinity = mask;
- desc->chip->set_affinity(irq, mask);
+ cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity);
+set_affinity:
+ desc->chip->set_affinity(irq, &desc->affinity);
return 0;
}
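With the signature change above, callers hand irq_set_affinity() a const struct cpumask *
instead of copying a cpumask_t by value. An illustrative (non-patch) caller, assuming the
cpumask_of() helper from the same API rework; the function name is hypothetical:

#include <linux/cpumask.h>
#include <linux/interrupt.h>

/* Hypothetical helper: pin a device irq to one CPU via the pointer-based API. */
static void example_pin_irq(unsigned int irq, int cpu)
{
	if (irq_can_set_affinity(irq))
		irq_set_affinity(irq, cpumask_of(cpu));	/* no NR_CPUS-sized copy on the stack */
}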
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 9db681d9581..bd72329e630 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,7 +4,6 @@
void move_masked_irq(int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- cpumask_t tmp;
if (likely(!(desc->status & IRQ_MOVE_PENDING)))
return;
@@ -19,7 +18,7 @@ void move_masked_irq(int irq)
desc->status &= ~IRQ_MOVE_PENDING;
- if (unlikely(cpus_empty(desc->pending_mask)))
+ if (unlikely(cpumask_empty(&desc->pending_mask)))
return;
if (!desc->chip->set_affinity)
@@ -27,8 +26,6 @@ void move_masked_irq(int irq)
assert_spin_locked(&desc->lock);
- cpus_and(tmp, desc->pending_mask, cpu_online_map);
-
/*
* If there was a valid mask to work with, please
* do the disable, re-program, enable sequence.
@@ -41,10 +38,13 @@ void move_masked_irq(int irq)
* For correct operation this depends on the caller
* masking the irqs.
*/
- if (likely(!cpus_empty(tmp))) {
- desc->chip->set_affinity(irq,tmp);
+ if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
+ < nr_cpu_ids)) {
+ cpumask_and(&desc->affinity,
+ &desc->pending_mask, cpu_online_mask);
+ desc->chip->set_affinity(irq, &desc->affinity);
}
- cpus_clear(desc->pending_mask);
+ cpumask_clear(&desc->pending_mask);
}
void move_native_irq(int irq)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index f6b3440f05b..d2c0e5ee53c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
- cpumask_t new_value;
+ cpumask_var_t new_value;
int err;
if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
irq_balancing_disabled(irq))
return -EIO;
+ if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ return -ENOMEM;
+
err = cpumask_parse_user(buffer, count, new_value);
if (err)
- return err;
+ goto free_cpumask;
- if (!is_affinity_mask_valid(new_value))
- return -EINVAL;
+ if (!is_affinity_mask_valid(*new_value)) {
+ err = -EINVAL;
+ goto free_cpumask;
+ }
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
* one online CPU still has to be targeted.
*/
- if (!cpus_intersects(new_value, cpu_online_map))
+ if (!cpumask_intersects(new_value, cpu_online_mask)) {
/* Special case for empty set - allow the architecture
code to set default SMP affinity. */
- return irq_select_affinity_usr(irq) ? -EINVAL : count;
-
- irq_set_affinity(irq, new_value);
+ err = irq_select_affinity_usr(irq) ? -EINVAL : count;
+ } else {
+ irq_set_affinity(irq, new_value);
+ err = count;
+ }
- return count;
+free_cpumask:
+ free_cpumask_var(new_value);
+ return err;
}
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
@@ -95,7 +104,7 @@ static ssize_t default_affinity_write(struct file *file,
cpumask_t new_value;
int err;
- err = cpumask_parse_user(buffer, count, new_value);
+ err = cpumask_parse_user(buffer, count, &new_value);
if (err)
return err;
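The write handler above settles on a shape that the converted /proc handlers share:
allocate the variable mask, parse and validate it, and free it on every exit path.
Reduced to a skeleton for reference (illustrative only, not part of the patch):

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/uaccess.h>

/* Hypothetical handler skeleton: the apply step is elided. */
static ssize_t example_mask_write(const char __user *buffer, size_t count)
{
	cpumask_var_t new_value;
	int err;

	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
		return -ENOMEM;

	err = cpumask_parse_user(buffer, count, new_value);
	if (err)
		goto out;

	if (!cpumask_intersects(new_value, cpu_online_mask)) {
		err = -EINVAL;			/* refuse a mask with no online CPU */
		goto out;
	}

	/* ... apply the mask here ... */
	err = count;				/* success: consumed the whole buffer */
out:
	free_cpumask_var(new_value);		/* freed on every path */
	return err;
}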
diff --git a/kernel/profile.c b/kernel/profile.c
index 60adefb59b5..4cb7d68fed8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,7 +442,7 @@ void profile_tick(int type)
static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
- int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
+ int len = cpumask_scnprintf(page, count, (cpumask_t *)data);
if (count - len < 2)
return -EINVAL;
len += sprintf(page + len, "\n");
@@ -456,7 +456,7 @@ static int prof_cpu_mask_write_proc(struct file *file,
unsigned long full_count = count, err;
cpumask_t new_value;
- err = cpumask_parse_user(buffer, count, new_value);
+ err = cpumask_parse_user(buffer, count, &new_value);
if (err)
return err;
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index e503a002f33..c03ca3e6191 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -393,7 +393,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
* unnecessarily.
*/
smp_mb();
- cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
+ cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask);
rcp->signaled = 0;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index fff1c4a20b6..27ba1d642f0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -498,18 +498,26 @@ struct rt_rq {
*/
struct root_domain {
atomic_t refcount;
- cpumask_t span;
- cpumask_t online;
+ cpumask_var_t span;
+ cpumask_var_t online;
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
*/
- cpumask_t rto_mask;
+ cpumask_var_t rto_mask;
atomic_t rto_count;
#ifdef CONFIG_SMP
struct cpupri cpupri;
#endif
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ /*
+ * Preferred wake up cpu nominated by sched_mc balance that will be
+ * used when most cpus are idle in the system indicating overall very
+ * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
+ */
+ unsigned int sched_mc_preferred_wakeup_cpu;
+#endif
};
/*
@@ -1514,7 +1522,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
struct sched_domain *sd = data;
int i;
- for_each_cpu_mask(i, sd->span) {
+ for_each_cpu(i, sched_domain_span(sd)) {
/*
* If there are currently no tasks on the cpu pretend there
* is one of average load so that when a new task gets to
@@ -1535,7 +1543,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
shares = tg->shares;
- for_each_cpu_mask(i, sd->span)
+ for_each_cpu(i, sched_domain_span(sd))
update_group_shares_cpu(tg, i, shares, rq_weight);
return 0;
@@ -2101,15 +2109,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
int i;
/* Skip over this group if it has no CPUs allowed */
- if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+ if (!cpumask_intersects(sched_group_cpus(group),
+ &p->cpus_allowed))
continue;
- local_group = cpu_isset(this_cpu, group->cpumask);
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_cpus(group));
/* Tally up the load of all CPUs in the group */
avg_load = 0;
- for_each_cpu_mask_nr(i, group->cpumask) {
+ for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
@@ -2141,17 +2151,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
* find_idlest_cpu - find the idlest cpu among the cpus in group.
*/
static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
- cpumask_t *tmp)
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
int idlest = -1;
int i;
/* Traverse only the allowed CPUs */
- cpus_and(*tmp, group->cpumask, p->cpus_allowed);
-
- for_each_cpu_mask_nr(i, *tmp) {
+ for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
load = weighted_cpuload(i);
if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2193,7 +2200,6 @@ static int sched_balance_self(int cpu, int flag)
update_shares(sd);
while (sd) {
- cpumask_t span, tmpmask;
struct sched_group *group;
int new_cpu, weight;
@@ -2202,14 +2208,13 @@ static int sched_balance_self(int cpu, int flag)
continue;
}
- span = sd->span;
group = find_idlest_group(sd, t, cpu);
if (!group) {
sd = sd->child;
continue;
}
- new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+ new_cpu = find_idlest_cpu(group, t, cpu);
if (new_cpu == -1 || new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
@@ -2218,10 +2223,10 @@ static int sched_balance_self(int cpu, int flag)
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
+ weight = cpumask_weight(sched_domain_span(sd));
sd = NULL;
- weight = cpus_weight(span);
for_each_domain(cpu, tmp) {
- if (weight <= cpus_weight(tmp->span))
+ if (weight <= cpumask_weight(sched_domain_span(tmp)))
break;
if (tmp->flags & flag)
sd = tmp;
@@ -2266,7 +2271,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
cpu = task_cpu(p);
for_each_domain(this_cpu, sd) {
- if (cpu_isset(cpu, sd->span)) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
update_shares(sd);
break;
}
@@ -2315,7 +2320,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
else {
struct sched_domain *sd;
for_each_domain(this_cpu, sd) {
- if (cpu_isset(cpu, sd->span)) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
schedstat_inc(sd, ttwu_wake_remote);
break;
}
@@ -2846,7 +2851,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
struct rq *rq;
rq = task_rq_lock(p, &flags);
- if (!cpu_isset(dest_cpu, p->cpus_allowed)
+ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
|| unlikely(!cpu_active(dest_cpu)))
goto out;
@@ -2911,7 +2916,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
*/
- if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+ if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
schedstat_inc(p, se.nr_failed_migrations_affine);
return 0;
}
@@ -3086,7 +3091,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum cpu_idle_type idle,
- int *sd_idle, const cpumask_t *cpus, int *balance)
+ int *sd_idle, const struct cpumask *cpus, int *balance)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -3122,10 +3127,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long sum_avg_load_per_task;
unsigned long avg_load_per_task;
- local_group = cpu_isset(this_cpu, group->cpumask);
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_cpus(group));
if (local_group)
- balance_cpu = first_cpu(group->cpumask);
+ balance_cpu = cpumask_first(sched_group_cpus(group));
/* Tally up the load of all CPUs in the group */
sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3134,13 +3140,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
max_cpu_load = 0;
min_cpu_load = ~0UL;
- for_each_cpu_mask_nr(i, group->cpumask) {
- struct rq *rq;
-
- if (!cpu_isset(i, *cpus))
- continue;
-
- rq = cpu_rq(i);
+ for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+ struct rq *rq = cpu_rq(i);
if (*sd_idle && rq->nr_running)
*sd_idle = 0;
@@ -3251,8 +3252,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
*/
if ((sum_nr_running < min_nr_running) ||
(sum_nr_running == min_nr_running &&
- first_cpu(group->cpumask) <
- first_cpu(group_min->cpumask))) {
+ cpumask_first(sched_group_cpus(group)) >
+ cpumask_first(sched_group_cpus(group_min)))) {
group_min = group;
min_nr_running = sum_nr_running;
min_load_per_task = sum_weighted_load /
@@ -3267,8 +3268,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (sum_nr_running <= group_capacity - 1) {
if (sum_nr_running > leader_nr_running ||
(sum_nr_running == leader_nr_running &&
- first_cpu(group->cpumask) >
- first_cpu(group_leader->cpumask))) {
+ cpumask_first(sched_group_cpus(group)) <
+ cpumask_first(sched_group_cpus(group_leader)))) {
group_leader = group;
leader_nr_running = sum_nr_running;
}
@@ -3394,6 +3395,10 @@ out_balanced:
if (this == group_leader && group_leader != group_min) {
*imbalance = min_load_per_task;
+ if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+ cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+ cpumask_first(sched_group_cpus(group_leader));
+ }
return group_min;
}
#endif
@@ -3407,16 +3412,16 @@ ret:
*/
static struct rq *
find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
- unsigned long imbalance, const cpumask_t *cpus)
+ unsigned long imbalance, const struct cpumask *cpus)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
int i;
- for_each_cpu_mask_nr(i, group->cpumask) {
+ for_each_cpu(i, sched_group_cpus(group)) {
unsigned long wl;
- if (!cpu_isset(i, *cpus))
+ if (!cpumask_test_cpu(i, cpus))
continue;
rq = cpu_rq(i);
@@ -3446,7 +3451,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
*/
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *balance, cpumask_t *cpus)
+ int *balance, struct cpumask *cpus)
{
int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
@@ -3454,7 +3459,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct rq *busiest;
unsigned long flags;
- cpus_setall(*cpus);
+ cpumask_setall(cpus);
/*
* When power savings policy is enabled for the parent domain, idle
@@ -3514,8 +3519,8 @@ redo:
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned)) {
- cpu_clear(cpu_of(busiest), *cpus);
- if (!cpus_empty(*cpus))
+ cpumask_clear_cpu(cpu_of(busiest), cpus);
+ if (!cpumask_empty(cpus))
goto redo;
goto out_balanced;
}
@@ -3532,7 +3537,8 @@ redo:
/* don't kick the migration_thread, if the curr
* task on busiest cpu can't be moved to this_cpu
*/
- if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+ if (!cpumask_test_cpu(this_cpu,
+ &busiest->curr->cpus_allowed)) {
spin_unlock_irqrestore(&busiest->lock, flags);
all_pinned = 1;
goto out_one_pinned;
@@ -3607,7 +3613,7 @@ out:
*/
static int
load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
- cpumask_t *cpus)
+ struct cpumask *cpus)
{
struct sched_group *group;
struct rq *busiest = NULL;
@@ -3616,7 +3622,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
int sd_idle = 0;
int all_pinned = 0;
- cpus_setall(*cpus);
+ cpumask_setall(cpus);
/*
* When power savings policy is enabled for the parent domain, idle
@@ -3660,17 +3666,71 @@ redo:
double_unlock_balance(this_rq, busiest);
if (unlikely(all_pinned)) {
- cpu_clear(cpu_of(busiest), *cpus);
- if (!cpus_empty(*cpus))
+ cpumask_clear_cpu(cpu_of(busiest), cpus);
+ if (!cpumask_empty(cpus))
goto redo;
}
}
if (!ld_moved) {
+ int active_balance = 0;
+
schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
+
+ if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+ return -1;
+
+ if (sd->nr_balance_failed++ < 2)
+ return -1;
+
+ /*
+ * The only task running in a non-idle cpu can be moved to this
+ * cpu in an attempt to completely freeup the other CPU
+ * package. The same method used to move task in load_balance()
+ * have been extended for load_balance_newidle() to speedup
+ * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
+ *
+ * The package power saving logic comes from
+ * find_busiest_group(). If there are no imbalance, then
+ * f_b_g() will return NULL. However when sched_mc={1,2} then
+ * f_b_g() will select a group from which a running task may be
+ * pulled to this cpu in order to make the other package idle.
+ * If there is no opportunity to make a package idle and if
+ * there are no imbalance, then f_b_g() will return NULL and no
+ * action will be taken in load_balance_newidle().
+ *
+ * Under normal task pull operation due to imbalance, there
+ * will be more than one task in the source run queue and
+ * move_tasks() will succeed. ld_moved will be true and this
+ * active balance code will not be triggered.
+ */
+
+ /* Lock busiest in correct order while this_rq is held */
+ double_lock_balance(this_rq, busiest);
+
+ /*
+ * don't kick the migration_thread, if the curr
+ * task on busiest cpu can't be moved to this_cpu
+ */
+ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+ double_unlock_balance(this_rq, busiest);
+ all_pinned = 1;
+ return ld_moved;
+ }
+
+ if (!busiest->active_balance) {
+ busiest->active_balance = 1;
+ busiest->push_cpu = this_cpu;
+ active_balance = 1;
+ }
+
+ double_unlock_balance(this_rq, busiest);
+ if (active_balance)
+ wake_up_process(busiest->migration_thread);
+
} else
sd->nr_balance_failed = 0;
@@ -3696,7 +3756,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
struct sched_domain *sd;
int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
- cpumask_t tmpmask;
+ cpumask_var_t tmpmask;
+
+ if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
+ return;
for_each_domain(this_cpu, sd) {
unsigned long interval;
@@ -3707,7 +3770,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
if (sd->flags & SD_BALANCE_NEWIDLE)
/* If we've pulled tasks over stop searching: */
pulled_task = load_balance_newidle(this_cpu, this_rq,
- sd, &tmpmask);
+ sd, tmpmask);
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
@@ -3722,6 +3785,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
*/
this_rq->next_balance = next_balance;
}
+ free_cpumask_var(tmpmask);
}
/*
@@ -3759,7 +3823,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
/* Search for an sd spanning us and the target CPU. */
for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
- cpu_isset(busiest_cpu, sd->span))
+ cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
break;
}
@@ -3778,10 +3842,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
#ifdef CONFIG_NO_HZ
static struct {
atomic_t load_balancer;
- cpumask_t cpu_mask;
+ cpumask_var_t cpu_mask;
} nohz ____cacheline_aligned = {
.load_balancer = ATOMIC_INIT(-1),
- .cpu_mask = CPU_MASK_NONE,
};
/*
@@ -3809,7 +3872,7 @@ int select_nohz_load_balancer(int stop_tick)
int cpu = smp_processor_id();
if (stop_tick) {
- cpu_set(cpu, nohz.cpu_mask);
+ cpumask_set_cpu(cpu, nohz.cpu_mask);
cpu_rq(cpu)->in_nohz_recently = 1;
/*
@@ -3823,7 +3886,7 @@ int select_nohz_load_balancer(int stop_tick)
}
/* time for ilb owner also to sleep */
- if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
if (atomic_read(&nohz.load_balancer) == cpu)
atomic_set(&nohz.load_balancer, -1);
return 0;
@@ -3836,10 +3899,10 @@ int select_nohz_load_balancer(int stop_tick)
} else if (atomic_read(&nohz.load_balancer) == cpu)
return 1;
} else {
- if (!cpu_isset(cpu, nohz.cpu_mask))
+ if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
return 0;
- cpu_clear(cpu, nohz.cpu_mask);
+ cpumask_clear_cpu(cpu, nohz.cpu_mask);
if (atomic_read(&nohz.load_balancer) == cpu)
if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@ -3867,7 +3930,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int need_serialize;
- cpumask_t tmp;
+ cpumask_var_t tmp;
+
+ /* Fails alloc? Rebalancing probably not a priority right now. */
+ if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
+ return;
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3892,7 +3959,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+ if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
@@ -3926,6 +3993,8 @@ out:
*/
if (likely(update_next_balance))
rq->next_balance = next_balance;
+
+ free_cpumask_var(tmp);
}
/*
@@ -3950,12 +4019,13 @@ static void run_rebalance_domains(struct softirq_action *h)
*/
if (this_rq->idle_at_tick &&
atomic_read(&nohz.load_balancer) == this_cpu) {
- cpumask_t cpus = nohz.cpu_mask;
struct rq *rq;
int balance_cpu;
- cpu_clear(this_cpu, cpus);
- for_each_cpu_mask_nr(balance_cpu, cpus) {
+ for_each_cpu(balance_cpu, nohz.cpu_mask) {
+ if (balance_cpu == this_cpu)
+ continue;
+
/*
* If this cpu gets work to do, stop the load balancing
* work being done for other cpus. Next load
@@ -3993,7 +4063,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
rq->in_nohz_recently = 0;
if (atomic_read(&nohz.load_balancer) == cpu) {
- cpu_clear(cpu, nohz.cpu_mask);
+ cpumask_clear_cpu(cpu, nohz.cpu_mask);
atomic_set(&nohz.load_balancer, -1);
}
@@ -4006,7 +4076,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
* TBD: Traverse the sched domains and nominate
* the nearest cpu in the nohz.cpu_mask.
*/
- int ilb = first_cpu(nohz.cpu_mask);
+ int ilb = cpumask_first(nohz.cpu_mask);
if (ilb < nr_cpu_ids)
resched_cpu(ilb);
@@ -4018,7 +4088,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
* cpus with ticks stopped, is it time for that to stop?
*/
if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
- cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
resched_cpu(cpu);
return;
}
@@ -4028,7 +4098,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
* someone else, then no need raise the SCHED_SOFTIRQ
*/
if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
- cpu_isset(cpu, nohz.cpu_mask))
+ cpumask_test_cpu(cpu, nohz.cpu_mask))
return;
#endif
if (time_after_eq(jiffies, rq->next_balance))
@@ -5401,10 +5471,9 @@ out_unlock:
return retval;
}
-long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
- cpumask_t cpus_allowed;
- cpumask_t new_mask = *in_mask;
+ cpumask_var_t cpus_allowed, new_mask;
struct task_struct *p;
int retval;
@@ -5426,6 +5495,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
get_task_struct(p);
read_unlock(&tasklist_lock);
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_put_task;
+ }
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
retval = -EPERM;
if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
goto out_unlock;
@@ -5434,37 +5511,41 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
if (retval)
goto out_unlock;
- cpuset_cpus_allowed(p, &cpus_allowed);
- cpus_and(new_mask, new_mask, cpus_allowed);
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, in_mask, cpus_allowed);
again:
- retval = set_cpus_allowed_ptr(p, &new_mask);
+ retval = set_cpus_allowed_ptr(p, new_mask);
if (!retval) {
- cpuset_cpus_allowed(p, &cpus_allowed);
- if (!cpus_subset(new_mask, cpus_allowed)) {
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
/*
* We must have raced with a concurrent cpuset
* update. Just reset the cpus_allowed to the
* cpuset's cpus_allowed
*/
- new_mask = cpus_allowed;
+ cpumask_copy(new_mask, cpus_allowed);
goto again;
}
}
out_unlock:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+out_put_task:
put_task_struct(p);