author      Ingo Molnar <mingo@elte.hu>    2009-01-05 13:53:39 +0100
committer   Ingo Molnar <mingo@elte.hu>    2009-01-05 13:53:39 +0100
commit      5359c32eb7402124abc9964d5d53639fe0739cea (patch)
tree        d77b6967fe8420678bb9d1d936855ac0699c196a /kernel/sched.c
parent      8916edef5888c5d8fe283714416a9ca95b4c3431 (diff)
parent      fe0bdec68b77020281dc814805edfe594ae89e0f (diff)
Merge branch 'linus' into sched/urgent
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--   kernel/sched.c   1090
1 files changed, 623 insertions, 467 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index fff1c4a20b6..545c6fccd1d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -498,18 +498,26 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; - cpumask_t span; - cpumask_t online; + cpumask_var_t span; + cpumask_var_t online; /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ - cpumask_t rto_mask; + cpumask_var_t rto_mask; atomic_t rto_count; #ifdef CONFIG_SMP struct cpupri cpupri; #endif +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Preferred wake up cpu nominated by sched_mc balance that will be + * used when most cpus are idle in the system indicating overall very + * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) + */ + unsigned int sched_mc_preferred_wakeup_cpu; +#endif }; /* @@ -1514,7 +1522,7 @@ static int tg_shares_up(struct task_group *tg, void *data) struct sched_domain *sd = data; int i; - for_each_cpu_mask(i, sd->span) { + for_each_cpu(i, sched_domain_span(sd)) { /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@ -1535,7 +1543,7 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu_mask(i, sd->span) + for_each_cpu(i, sched_domain_span(sd)) update_group_shares_cpu(tg, i, shares, rq_weight); return 0; @@ -2101,15 +2109,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) continue; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -2141,17 +2151,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * find_idlest_cpu - find the idlest cpu among the cpus in group. 
*/ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, - cpumask_t *tmp) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(*tmp, group->cpumask, p->cpus_allowed); - - for_each_cpu_mask_nr(i, *tmp) { + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -2193,7 +2200,6 @@ static int sched_balance_self(int cpu, int flag) update_shares(sd); while (sd) { - cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@ -2202,14 +2208,13 @@ static int sched_balance_self(int cpu, int flag) continue; } - span = sd->span; group = find_idlest_group(sd, t, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -2218,10 +2223,10 @@ static int sched_balance_self(int cpu, int flag) /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); sd = NULL; - weight = cpus_weight(span); for_each_domain(cpu, tmp) { - if (weight <= cpus_weight(tmp->span)) + if (weight <= cpumask_weight(sched_domain_span(tmp))) break; if (tmp->flags & flag) sd = tmp; @@ -2266,7 +2271,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) cpu = task_cpu(p); for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { update_shares(sd); break; } @@ -2315,7 +2320,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) else { struct sched_domain *sd; for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { schedstat_inc(sd, ttwu_wake_remote); break; } @@ -2846,7 +2851,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) struct rq *rq; rq = task_rq_lock(p, &flags); - if (!cpu_isset(dest_cpu, p->cpus_allowed) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) || unlikely(!cpu_active(dest_cpu))) goto out; @@ -2911,7 +2916,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. 
*/ - if (!cpu_isset(this_cpu, p->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { schedstat_inc(p, se.nr_failed_migrations_affine); return 0; } @@ -3086,7 +3091,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const cpumask_t *cpus, int *balance) + int *sd_idle, const struct cpumask *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -3122,10 +3127,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); if (local_group) - balance_cpu = first_cpu(group->cpumask); + balance_cpu = cpumask_first(sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -3134,13 +3140,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu_mask_nr(i, group->cpumask) { - struct rq *rq; - - if (!cpu_isset(i, *cpus)) - continue; - - rq = cpu_rq(i); + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); if (*sd_idle && rq->nr_running) *sd_idle = 0; @@ -3251,8 +3252,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - first_cpu(group->cpumask) < - first_cpu(group_min->cpumask))) { + cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group_min)))) { group_min = group; min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / @@ -3267,8 +3268,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { + cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group_leader)))) { group_leader = group; leader_nr_running = sum_nr_running; } @@ -3394,6 +3395,10 @@ out_balanced: if (this == group_leader && group_leader != group_min) { *imbalance = min_load_per_task; + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + cpumask_first(sched_group_cpus(group_leader)); + } return group_min; } #endif @@ -3407,16 +3412,16 @@ ret: */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const cpumask_t *cpus) + unsigned long imbalance, const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; int i; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { unsigned long wl; - if (!cpu_isset(i, *cpus)) + if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); @@ -3446,7 +3451,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, cpumask_t *cpus) + int *balance, struct cpumask *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; @@ 
-3454,7 +3459,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; unsigned long flags; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3514,8 +3519,8 @@ redo: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; goto out_balanced; } @@ -3532,7 +3537,8 @@ redo: /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, + &busiest->curr->cpus_allowed)) { spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; @@ -3607,7 +3613,7 @@ out: */ static int load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - cpumask_t *cpus) + struct cpumask *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@ -3616,7 +3622,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, int sd_idle = 0; int all_pinned = 0; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3660,17 +3666,71 @@ redo: double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; } } if (!ld_moved) { + int active_balance = 0; + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; + + if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) + return -1; + + if (sd->nr_balance_failed++ < 2) + return -1; + + /* + * The only task running in a non-idle cpu can be moved to this + * cpu in an attempt to completely freeup the other CPU + * package. The same method used to move task in load_balance() + * have been extended for load_balance_newidle() to speedup + * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) + * + * The package power saving logic comes from + * find_busiest_group(). If there are no imbalance, then + * f_b_g() will return NULL. However when sched_mc={1,2} then + * f_b_g() will select a group from which a running task may be + * pulled to this cpu in order to make the other package idle. + * If there is no opportunity to make a package idle and if + * there are no imbalance, then f_b_g() will return NULL and no + * action will be taken in load_balance_newidle(). + * + * Under normal task pull operation due to imbalance, there + * will be more than one task in the source run queue and + * move_tasks() will succeed. ld_moved will be true and this + * active balance code will not be triggered. 
+ */ + + /* Lock busiest in correct order while this_rq is held */ + double_lock_balance(this_rq, busiest); + + /* + * don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { + double_unlock_balance(this_rq, busiest); + all_pinned = 1; + return ld_moved; + } + + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + + double_unlock_balance(this_rq, busiest); + if (active_balance) + wake_up_process(busiest->migration_thread); + } else sd->nr_balance_failed = 0; @@ -3696,7 +3756,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - cpumask_t tmpmask; + cpumask_var_t tmpmask; + + if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) + return; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3707,7 +3770,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, &tmpmask); + sd, tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3722,6 +3785,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } + free_cpumask_var(tmpmask); } /* @@ -3759,7 +3823,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && - cpu_isset(busiest_cpu, sd->span)) + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; } @@ -3778,10 +3842,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; - cpumask_t cpu_mask; + cpumask_var_t cpu_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), - .cpu_mask = CPU_MASK_NONE, }; /* @@ -3809,7 +3872,7 @@ int select_nohz_load_balancer(int stop_tick) int cpu = smp_processor_id(); if (stop_tick) { - cpu_set(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.cpu_mask); cpu_rq(cpu)->in_nohz_recently = 1; /* @@ -3823,7 +3886,7 @@ int select_nohz_load_balancer(int stop_tick) } /* time for ilb owner also to sleep */ - if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@ -3836,10 +3899,10 @@ int select_nohz_load_balancer(int stop_tick) } else if (atomic_read(&nohz.load_balancer) == cpu) return 1; } else { - if (!cpu_isset(cpu, nohz.cpu_mask)) + if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); if (atomic_read(&nohz.load_balancer) == cpu) if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) @@ -3867,7 +3930,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_t tmp; + cpumask_var_t tmp; + + /* Fails alloc? Rebalancing probably not a priority right now. 
*/ + if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) + return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3892,7 +3959,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -3926,6 +3993,8 @@ out: */ if (likely(update_next_balance)) rq->next_balance = next_balance; + + free_cpumask_var(tmp); } /* @@ -3950,12 +4019,13 @@ static void run_rebalance_domains(struct softirq_action *h) */ if (this_rq->idle_at_tick && atomic_read(&nohz.load_balancer) == this_cpu) { - cpumask_t cpus = nohz.cpu_mask; struct rq *rq; int balance_cpu; - cpu_clear(this_cpu, cpus); - for_each_cpu_mask_nr(balance_cpu, cpus) { + for_each_cpu(balance_cpu, nohz.cpu_mask) { + if (balance_cpu == this_cpu) + continue; + /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@ -3993,7 +4063,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) rq->in_nohz_recently = 0; if (atomic_read(&nohz.load_balancer) == cpu) { - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); atomic_set(&nohz.load_balancer, -1); } @@ -4006,7 +4076,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * TBD: Traverse the sched domains and nominate * the nearest cpu in the nohz.cpu_mask. */ - int ilb = first_cpu(nohz.cpu_mask); + int ilb = cpumask_first(nohz.cpu_mask); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -4018,7 +4088,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * cpus with ticks stopped, is it time for that to stop? */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { resched_cpu(cpu); return; } @@ -4028,7 +4098,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * someone else, then no need raise the SCHED_SOFTIRQ */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpu_isset(cpu, nohz.cpu_mask)) + cpumask_test_cpu(cpu, nohz.cpu_mask)) return; #endif if (time_after_eq(jiffies, rq->next_balance)) @@ -4080,13 +4150,17 @@ unsigned long long task_delta_exec(struct task_struct *p) * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ -void account_user_time(struct task_struct *p, cputime_t cputime) +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; cputime64_t tmp; + /* Add user time to process. */ p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); /* Add user time to cpustat. */ @@ -4103,51 +4177,48 @@ void account_user_time(struct task_struct *p, cputime_t cputime) * Account guest cpu time to a process. 
* @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ -static void account_guest_time(struct task_struct *p, cputime_t cputime) +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) { cputime64_t tmp; struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; tmp = cputime_to_cputime64(cputime); + /* Add guest time to process. */ p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); + /* Add guest time to cpustat. */ cpustat->user = cputime64_add(cpustat->user, tmp); cpustat->guest = cputime64_add(cpustat->guest, tmp); } /* - * Account scaled user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - */ -void account_user_time_scaled(struct task_struct *p, cputime_t cputime) -{ - p->utimescaled = cputime_add(p->utimescaled, cputime); -} - -/* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime) + cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - struct rq *rq = this_rq(); cputime64_t tmp; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime); + account_guest_time(p, cputime, cputime_scaled); return; } + /* Add system time to process. */ p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); account_group_system_time(p, cputime); /* Add system time to cpustat. */ @@ -4156,48 +4227,84 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->irq = cputime64_add(cpustat->irq, tmp); else if (softirq_count()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); - else if (p != rq->idle) - cpustat->system = cputime64_add(cpustat->system, tmp); - else if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else - cpustat->idle = cputime64_add(cpustat->idle, tmp); + cpustat->system = cputime64_add(cpustat->system, tmp); + /* Account for system time used */ acct_update_integrals(p); } /* - * Account scaled system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update + * Account for involuntary wait time. + * @steal: the cpu time spent in involuntary wait */ -void account_system_time_scaled(struct task_struct *p, cputime_t cputime) +void account_steal_time(cputime_t cputime) { - p->stimescaled = cputime_add(p->stimescaled, cputime); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + + cpustat->steal = cputime64_add(cpustat->steal, cputime64); } /* - * Account for involuntary wait time. - * @p: the process from which the cpu time has been stolen - * @steal: the cpu time spent in involuntary wait + * Account for idle time. 
+ * @cputime: the cpu time spent in idle wait */ -void account_steal_time(struct task_struct *p, cputime_t steal) +void account_idle_time(cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp = cputime_to_cputime64(steal); + cputime64_t cputime64 = cputime_to_cputime64(cputime); struct rq *rq = this_rq(); - if (p == rq->idle) { - p->stime = cputime_add(p->stime, steal); - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); - else - cpustat->idle = cputime64_add(cpustat->idle, tmp); - } else - cpustat->steal = cputime64_add(cpustat->steal, tmp); + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + else + cpustat->idle = cputime64_add(cpustat->idle, cputime64); } +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy = jiffies_to_cputime(1); + cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + struct rq *rq = this_rq(); + + if (user_tick) + account_user_time(p, one_jiffy, one_jiffy_scaled); + else if (p != rq->idle) + account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + one_jiffy_scaled); + else + account_idle_time(one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + account_idle_time(jiffies_to_cputime(ticks)); +} + +#endif + /* * Use precise platform statistics if available: */ @@ -5401,10 +5508,9 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { - cpumask_t cpus_allowed; - cpumask_t new_mask = *in_mask; + cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; @@ -5426,6 +5532,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) get_task_struct(p); read_unlock(&tasklist_lock); + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } retval = -EPERM; if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; @@ -5434,37 +5548,41 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) if (retval) goto out_unlock; - cpuset_cpus_allowed(p, &cpus_allowed); - cpus_and(new_mask, new_mask, cpus_allowed); + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); again: - retval = set_cpus_allowed_ptr(p, &new_mask); + retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { - cpuset_cpus_allowed(p, &cpus_allowed); - if (!cpus_subset(new_mask, cpus_allowed)) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset * update. 
Just reset the cpus_allowed to the * cpuset's cpus_allowed */ - new_mask = cpus_allowed; + cpumask_copy(new_mask, cpus_allowed); goto again; } } out_unlock: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); +out_put_task: put_task_struct(p); put_online_cpus(); return retval; } static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) + struct cpumask *new_mask) { - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; } @@ -5477,17 +5595,20 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { - cpumask_t new_mask; + cpumask_var_t new_mask; int retval; - retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; - return sched_setaffinity(pid, &new_mask); + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; } -long sched_getaffinity(pid_t pid, cpumask_t *mask) +long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; int retval; @@ -5504,7 +5625,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (retval) goto out_unlock; - cpus_and(*mask, p->cpus_allowed, cpu_online_map); + cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); out_unlock: read_unlock(&tasklist_lock); @@ -5523,19 +5644,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { int ret; - cpumask_t mask; + cpumask_var_t mask; - if (len < sizeof(cpumask_t)) + if (len < cpumask_size()) return -EINVAL; - ret = sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; - if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) - return -EFAULT; + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + ret = -EFAULT; + else + ret = cpumask_size(); + } + free_cpumask_var(mask); - return sizeof(cpumask_t); + return ret; } /** @@ -5877,7 +6003,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->prio = idle->normal_prio = MAX_PRIO; - idle->cpus_allowed = cpumask_of_cpu(cpu); + cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; @@ -5904,9 +6030,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * indicates which cpus entered this state. This is used * in the rcu update to wait only for active cpus. For system * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_MASK_NONE. + * always be CPU_BITS_NONE. */ -cpumask_t nohz_cpu_mask = CPU_MASK_NONE; +cpumask_var_t nohz_cpu_mask; /* * Increase the granularity value when there are more CPUs, @@ -5961,7 +6087,7 @@ static inline void sched_init_granularity(void) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. 
*/ -int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { struct migration_req req; unsigned long flags; @@ -5969,13 +6095,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(*new_mask, cpu_online_map)) { + if (!cpumask_intersects(new_mask, cpu_online_mask)) { ret = -EINVAL; goto out; } if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpus_equal(p->cpus_allowed, *new_mask))) { + !cpumask_equal(&p->cpus_allowed, new_mask))) { ret = -EINVAL; goto out; } @@ -5983,15 +6109,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = *new_mask; - p->rt.nr_cpus_allowed = cpus_weight(*new_mask); + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), *new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, any_online_cpu(*new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -6033,7 +6159,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). */ - if (!cpu_isset(dest_cpu, p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; on_rq = p->se.on_rq; @@ -6130,50 +6256,41 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - unsigned long flags; - cpumask_t mask; - struct rq *rq; int dest_cpu; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); - do { - /* On same node? */ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); +again: + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto move; - /* On any allowed CPU? */ - if (dest_cpu >= nr_cpu_ids) - dest_cpu = any_online_cpu(p->cpus_allowed); + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + if (dest_cpu < nr_cpu_ids) + goto move; - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpumask_t cpus_allowed; - - cpuset_cpus_allowed_locked(p, &cpus_allowed); - /* - * Try to stay on the same cpuset, where the - * current cpuset may be a subset of all cpus. - * The cpuset_cpus_allowed_locked() variant of - * cpuset_cpus_allowed() will not block. It must be - * called within calls to cpuset_lock/cpuset_unlock. - */ - rq = task_rq_lock(p, &flags); - p->cpus_allowed = cpus_allowed; - dest_cpu = any_online_cpu(p->cpus_allowed); - task_rq_unlock(rq, &flags); + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. 
- */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); } - } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); + } + +move: + /* It can have affinity changed while we were choosing. */ + if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) + goto again; } /* @@ -6185,7 +6302,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq |
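
Note on the API conversion in the diff above: most hunks replace the old fixed-size cpumask_t operations (cpu_isset(), cpus_and(), cpus_weight(), first_cpu(), for_each_cpu_mask_nr()) with the pointer-based struct cpumask API (cpumask_test_cpu(), cpumask_and(), cpumask_weight(), cpumask_first(), for_each_cpu()/for_each_cpu_and()), and replace direct sd->span and group->cpumask accesses with the sched_domain_span() and sched_group_cpus() accessors. The sketch below is illustrative only and not part of the patch; the helper name is made up, and the struct sched_group/task_struct layout of this kernel is assumed.

#include <linux/cpumask.h>
#include <linux/sched.h>

/* Hypothetical helper: count the CPUs of a scheduler group that a task is
 * allowed to run on, written against the new-style cpumask API. */
static int count_allowed_cpus_in_group(struct sched_group *group,
                                       struct task_struct *p)
{
        int i, nr = 0;

        /* for_each_cpu_and() replaces the old "cpus_and() into a temporary
         * mask, then for_each_cpu_mask_nr() over it" pattern. */
        for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed)
                nr++;

        return nr;
}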
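The temporary masks (tmpmask in idle_balance(), tmp in rebalance_domains(), cpus_allowed/new_mask in sched_setaffinity(), nohz.cpu_mask) also move from on-stack cpumask_t to cpumask_var_t, which must be allocated with alloc_cpumask_var() and released with free_cpumask_var(); with large NR_CPUS an on-stack cpumask costs hundreds of bytes of stack. A minimal sketch of that pattern follows, with a made-up function name and no claim to match the patch's exact call sites.

#include <linux/cpumask.h>
#include <linux/gfp.h>

static void example_scan_online_cpus(void)
{
        cpumask_var_t tmp;
        int cpu;

        /* With CONFIG_CPUMASK_OFFSTACK=y this allocates the mask from the
         * heap; otherwise cpumask_var_t is a one-element array and the
         * call always succeeds. */
        if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
                return;         /* degrade gracefully, as idle_balance() does */

        cpumask_copy(tmp, cpu_online_mask);
        for_each_cpu(cpu, tmp) {
                /* ... per-CPU work ... */
        }

        free_cpumask_var(tmp);
}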
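The cputime hunks change the accounting API: account_user_time() and account_system_time() gain a cputime_scaled parameter (the separate account_user_time_scaled()/account_system_time_scaled() helpers go away), idle and steal time get their own account_idle_time() and account_steal_time(), and, for kernels without CONFIG_VIRT_CPU_ACCOUNTING, generic account_process_tick(), account_steal_ticks() and account_idle_ticks() are added. Below is a sketch of how a tick or idle catch-up path could call the new entry points; the caller names are hypothetical (in the kernel the periodic tick goes through update_process_times()).

#include <linux/kernel_stat.h>
#include <linux/sched.h>

/* Hypothetical periodic-tick hook: one jiffy is charged to 'current' as
 * user, system or idle time, scaled variant included. */
static void example_tick(int user_tick)
{
        account_process_tick(current, user_tick);
}

/* Hypothetical catch-up path for a tickless or virtualized CPU: batch
 * ticks stolen by the hypervisor and ticks spent idle. */
static void example_catch_up(unsigned long stolen_ticks, unsigned long idle_ticks)
{
        account_steal_ticks(stolen_ticks);
        account_idle_ticks(idle_ticks);
}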