Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	291
1 file changed, 246 insertions(+), 45 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 5a5cc33e499..d42992bccdf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
 	 */
 	cpumask_var_t rto_mask;
 	atomic_t rto_count;
-#ifdef CONFIG_SMP
 	struct cpupri cpupri;
-#endif
 };
 
 /*
@@ -437,7 +435,7 @@ struct root_domain {
  */
 static struct root_domain def_root_domain;
 
-#endif
+#endif /* CONFIG_SMP */
 
 /*
  * This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
 	 */
 	unsigned long nr_uninterruptible;
 
-	struct task_struct *curr, *idle;
+	struct task_struct *curr, *idle, *stop;
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
 
 	u64 clock;
+	u64 clock_task;
 
 	atomic_t nr_iowait;
 
@@ -520,6 +519,10 @@ struct rq {
 	u64 avg_idle;
 #endif
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	u64 prev_irq_time;
+#endif
+
 	/* calc_load related fields */
 	unsigned long calc_load_update;
 	long calc_load_active;
@@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+
 inline void update_rq_clock(struct rq *rq)
 {
-	if (!rq->skip_clock_update)
-		rq->clock = sched_clock_cpu(cpu_of(rq));
+	if (!rq->skip_clock_update) {
+		int cpu = cpu_of(rq);
+		u64 irq_time;
+
+		rq->clock = sched_clock_cpu(cpu);
+		irq_time = irq_time_cpu(cpu);
+		if (rq->clock - irq_time > rq->clock_task)
+			rq->clock_task = rq->clock - irq_time;
+
+		sched_irq_time_avg_update(rq, irq_time);
+	}
 }
 
 /*
@@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 		size_t cnt, loff_t *ppos)
 {
 	char buf[64];
-	char *cmp = buf;
+	char *cmp;
 	int neg = 0;
 	int i;
 
@@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 		return -EFAULT;
 
 	buf[cnt] = 0;
+	cmp = strstrip(buf);
 
 	if (strncmp(buf, "NO_", 3) == 0) {
 		neg = 1;
@@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	}
 
 	for (i = 0; sched_feat_names[i]; i++) {
-		int len = strlen(sched_feat_names[i]);
-
-		if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+		if (strcmp(cmp, sched_feat_names[i]) == 0) {
 			if (neg)
 				sysctl_sched_features &= ~(1UL << i);
 			else
@@ -1840,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 static const struct sched_class rt_sched_class;
 
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&stop_sched_class)
 #define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)
 
@@ -1858,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
 
 static void set_load_weight(struct task_struct *p)
 {
-	if (task_has_rt_policy(p)) {
-		p->se.load.weight = 0;
-		p->se.load.inv_weight = WMULT_CONST;
-		return;
-	}
-
 	/*
 	 * SCHED_IDLE tasks get minimal weight:
 	 */
@@ -1917,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 	dec_nr_running(rq);
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value (or semi updated value on 32 bit) with a side effect of
+ * accounting a slice of irq time to wrong task when irq is in progress
+ * while we read rq->clock. That is a worthy compromise in place of having
+ * locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 0;
+}
+
+static u64 irq_time_cpu(int cpu)
+{
+	if (!sched_clock_irqtime)
+		return 0;
+
+	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)
+{
+	unsigned long flags;
+	int cpu;
+	u64 now, delta;
+
+	if (!sched_clock_irqtime)
+		return;
+
+	local_irq_save(flags);
+
+	cpu = smp_processor_id();
+	now = sched_clock_cpu(cpu);
+	delta = now - per_cpu(irq_start_time, cpu);
+	per_cpu(irq_start_time, cpu) = now;
+	/*
+	 * We do not account for softirq time from ksoftirqd here.
+	 * We want to continue accounting softirq time to ksoftirqd thread
+	 * in that case, so as not to confuse scheduler with a special task
+	 * that do not consume any time, but still wants to run.
+	 */
+	if (hardirq_count())
+		per_cpu(cpu_hardirq_time, cpu) += delta;
+	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+		per_cpu(cpu_softirq_time, cpu) += delta;
+
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+{
+	if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+		u64 delta_irq = curr_irq_time - rq->prev_irq_time;
+		rq->prev_irq_time = curr_irq_time;
+		sched_rt_avg_update(rq, delta_irq);
+	}
+}
+
+#else
+
+static u64 irq_time_cpu(int cpu)
+{
+	return 0;
+}
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+
+#endif
+
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_stoptask.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
 
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+	struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+	if (stop) {
+		/*
+		 * Make it appear like a SCHED_FIFO task, its something
+		 * userspace knows about and won't get confused about.
+		 *
+		 * Also, it will make PI more or less work without too
+		 * much confusion -- but then, stop work should not
+		 * rely on PI working anyway.
+		 */
+		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+		stop->sched_class = &stop_sched_class;
+	}
+
+	cpu_rq(cpu)->stop = stop;
+
+	if (old_stop) {
+		/*
+		 * Reset it back to a normal scheduling class so that
+		 * it can die in pieces.
+		 */
+		old_stop->sched_class = &rt_sched_class;
+	}
+}
+
 /*
  * __normal_prio - return the priority that is based on the static prio
  */
@@ -2003,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
+	if (unlikely(p->policy == SCHED_IDLE))
+		return 0;
+
 	/*
 	 * Buddy candidates are cache hot:
 	 */
@@ -2852,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 */
 	arch_start_context_switch(prev);
 
-	if (likely(!mm)) {
+	if (!mm) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
 		enter_lazy_tlb(oldmm, next);
 	} else
 		switch_mm(oldmm, mm, next);
 
-	if (likely(!prev->mm)) {
+	if (!prev->mm) {
 		prev->active_mm = NULL;
 		rq->prev_mm = oldmm;
 	}
@@ -3248,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
 
 	if (task_current(rq, p)) {
 		update_rq_clock(rq);
-		ns = rq->clock - p->se.exec_start;
+		ns = rq->clock_task - p->se.exec_start;
 		if ((s64)ns < 0)
 			ns = 0;
 	}
@@ -3397,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	tmp = cputime_to_cputime64(cputime);
 	if (hardirq_count() - hardirq_offset)
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
-	else if (softirq_count())
+	else if (in_serving_softirq())
 		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
 	else
 		cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3723,17 +3853,13 @@ pick_next_task(struct rq *rq)
 			return p;
 	}
 
-	class = sched_class_highest;
-	for ( ; ; ) {
+	for_each_class(class) {
 		p = class->pick_next_task(rq);
 		if (p)
 			return p;
-		/*
-		 * Will never be NULL as the idle class always
-		 * returns a non-NULL p:
-		 */
-		class = class->next;
 	}
+
+	BUG(); /* the idle class will always have a runnable task */
 }
 
 /*
@@ -4358,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	rq = task_rq_lock(p, &flags);
 
+	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
 	on_rq = p->se.on_rq;
@@ -4661,6 +4788,15 @@ recheck:
 	 */
 	rq = __task_rq_lock(p);
 
+	/*
+	 * Changing the policy of the stop threads its a very bad idea
+	 */
+	if (p == rq->stop) {
+		__task_rq_unlock(rq);
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		return -EINVAL;
+	}
+
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (user) {
 		/*
@@ -4893,7 +5029,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	cpuset_cpus_allowed(p, cpus_allowed);
 	cpumask_and(new_mask, in_mask, cpus_allowed);
 
- again:
+again:
 	retval = set_cpus_allowed_ptr(p, new_mask);
 
 	if (!retval) {
@@ -6526,6 +6662,7 @@ struct s_data {
 	cpumask_var_t		nodemask;
 	cpumask_var_t		this_sibling_map;
 	cpumask_var_t		this_core_map;
+	cpumask_var_t		this_book_map;
 	cpumask_var_t		send_covered;
 	cpumask_var_t		tmpmask;
 	struct sched_group	**sched_group_nodes;
@@ -6537,6 +6674,7 @@ enum s_alloc {
 	sa_rootdomain,
 	sa_tmpmask,
 	sa_send_covered,
+	sa_this_book_map,
 	sa_this_core_map,
 	sa_this_sibling_map,
 	sa_nodemask,
@@ -6572,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
-#endif /* CONFIG_SCHED_MC */
 
-#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int
 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
 		  struct sched_group **sg, struct cpumask *mask)
 {
 	int group;
-
+#ifdef CONFIG_SCHED_SMT
 	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
 	group = cpumask_first(mask);
+#else
+	group = cpu;
+#endif
 	if (sg)
 		*sg = &per_cpu(sched_group_core, group).sg;
 	return group;
 }
-#elif defined(CONFIG_SCHED_MC)
+#endif /* CONFIG_SCHED_MC */
+
+/*
+ * book sched-domains:
+ */
+#ifdef CONFIG_SCHED_BOOK
+static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+
 static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
-		  struct sched_group **sg, struct cpumask *unused)
+cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
+		  struct sched_group **sg, struct cpumask *mask)
 {
+	int group = cpu;
+#ifdef CONFIG_SCHED_MC
+	cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+	group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_SMT)
+	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+	group = cpumask_first(mask);
+#endif
 	if (sg)
-		*sg = &per_cpu(sched_group_core, cpu).sg;
-	return cpu;
+		*sg = &per_cpu(sched_group_book, group).sg;
+	return group;
 }
-#endif
+#endif /* CONFIG_SCHED_BOOK */
 
 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6606,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
 		  struct sched_group **sg, struct cpumask *mask)
 {
 	int group;
-#ifdef CONFIG_SCHED_MC
+#ifdef CONFIG_SCHED_BOOK
+	cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+	group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_MC)
 	cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
 	group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
@@ -6867,6 +7025,9 @@ SD_INIT_FUNC(CPU)
 #ifdef CONFIG_SCHED_MC
  SD_INIT_FUNC(MC)
 #endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif
 
 static int default_relax_domain_level = -1;
 
@@ -6916,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 		free_cpumask_var(d->tmpmask); /* fall through */
 	case sa_send_covered:
 		free_cpumask_var(d->send_covered); /* fall through */
+	case sa_this_book_map:
+		free_cpumask_var(d->this_book_map); /* fall through */
 	case sa_this_core_map:
 		free_cpumask_var(d->this_core_map); /* fall through */
 	case sa_this_sibling_map:
@@ -6962,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 		return sa_nodemask;
 	if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
 		return sa_this_sibling_map;
-	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+	if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
 		return sa_this_core_map;
+	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+		return sa_this_book_map;
 	if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
 		return sa_send_covered;
 	d->rd = alloc_rootdomain();
@@ -7021,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
 	return sd;
 }
 
+static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
+{
+	struct sched_domain *sd = parent;
+#ifdef CONFIG_SCHED_BOOK
+	sd = &per_cpu(book_domains, i).sd;
+	SD_INIT(sd, BOOK);
+	set_domain_attribute(sd, attr);
+	cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
+	sd->parent = parent;
+	parent->child = sd;
+	cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
+#endif
+	return sd;
+}
+
 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
 	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 	struct sched_domain *parent, int i)
@@ -7078,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
 						d->send_covered, d->tmpmask);
 		break;
 #endif
+#ifdef CONFIG_SCHED_BOOK
+	case SD_LV_BOOK: /* set up book groups */
+		cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+		if (cpu == cpumask_first(d->this_book_map))
+			init_sched_build_groups(d->this_book_map, cpu_map,
+						&cpu_to_book_group,
+						d->send_covered, d->tmpmask);
+		break;
+#endif
 	case SD_LV_CPU: /* set up physical groups */
 		cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
 		if (!cpumask_empty(d->nodemask))
@@ -7125,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
 		sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
 		sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+		sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
 		sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
 		sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
 	}
 
 	for_each_cpu(i, cpu_map) {
 		build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+		build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
 		build_sched_groups(&d, SD_LV_MC, cpu_map, i);
 	}
 
@@ -7161,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		init_sched_groups_power(i, sd);
 	}
 #endif
+#ifdef CONFIG_SCHED_BOOK
+	for_each_cpu(i, cpu_map) {
+		sd = &per_cpu(book_domains, i).sd;
+		init_sched_groups_power(i, sd);
+	}
+#endif
 
 	for_each_cpu(i, cpu_map) {
 		sd = &per_cpu(phys_domains, i).sd;
@@ -7186,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
 		sd = &per_cpu(core_domains, i).sd;
+#elif defined(CONFIG_SCHED_BOOK)
+		sd = &per_cpu(book_domains, i).sd;
 #else
 		sd = &per_cpu(phys_domains, i).sd;
 #endif
@@ -8090,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 
 	return 1;
 
- err_free_rq:
+err_free_rq:
 	kfree(cfs_rq);
- err:
+err:
 	return 0;
 }
 
@@ -8180,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 
 	return 1;
 
- err_free_rq:
+err_free_rq:
 	kfree(rt_rq);
- err:
+err:
 	return 0;
 }
 
@@ -8540,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
+unlock:
 	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
