Diffstat (limited to 'kernel/sched/sched.h')
-rw-r--r--	kernel/sched/sched.h	244
1 file changed, 190 insertions, 54 deletions
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfdf469..ce39224d615 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,8 +5,10 @@
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/stop_machine.h>
+#include <linux/tick.h>

 #include "cpupri.h"
+#include "cpuacct.h"

 extern __read_mostly int scheduler_running;

@@ -33,6 +35,31 @@ extern __read_mostly int scheduler_running;
  */
 #define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

+/*
+ * Increase resolution of nice-level calculations for 64-bit architectures.
+ * The extra resolution improves shares distribution and load balancing of
+ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
+ * hierarchies, especially on larger systems. This is not a user-visible change
+ * and does not change the user-interface for setting shares/weights.
+ *
+ * We increase resolution only if we have enough bits to allow this increased
+ * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
+ * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
+ * increased costs.
+ */
+#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load  */
+# define SCHED_LOAD_RESOLUTION	10
+# define scale_load(w)		((w) << SCHED_LOAD_RESOLUTION)
+# define scale_load_down(w)	((w) >> SCHED_LOAD_RESOLUTION)
+#else
+# define SCHED_LOAD_RESOLUTION	0
+# define scale_load(w)		(w)
+# define scale_load_down(w)	(w)
+#endif
+
+#define SCHED_LOAD_SHIFT	(10 + SCHED_LOAD_RESOLUTION)
+#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
+
 #define NICE_0_LOAD		SCHED_LOAD_SCALE
 #define NICE_0_SHIFT		SCHED_LOAD_SHIFT

@@ -154,11 +181,6 @@ struct task_group {
 #define MAX_SHARES	(1UL << 18)
 #endif

-/* Default task group.
- *	Every task in system belong to this group at bootup.
- */
-extern struct task_group root_task_group;
-
 typedef int (*tg_visitor)(struct task_group *, void *);

 extern int walk_tg_tree_from(struct task_group *from,
@@ -196,6 +218,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 		struct sched_rt_entity *rt_se, int cpu,
 		struct sched_rt_entity *parent);

+extern struct task_group *sched_create_group(struct task_group *parent);
+extern void sched_online_group(struct task_group *tg,
+			       struct task_group *parent);
+extern void sched_destroy_group(struct task_group *tg);
+extern void sched_offline_group(struct task_group *tg);
+
+extern void sched_move_task(struct task_struct *tsk);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+#endif
+
 #else /* CONFIG_CGROUP_SCHED */

 struct cfs_bandwidth { };
@@ -372,10 +406,13 @@ struct rq {
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 	unsigned long last_load_update_tick;
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
 	u64 nohz_stamp;
 	unsigned long nohz_flags;
 #endif
+#ifdef CONFIG_NO_HZ_FULL
+	unsigned long last_sched_tick;
+#endif
 	int skip_clock_update;

 	/* capture load from *all* tasks on this cpu: */
@@ -547,6 +584,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_id);

+struct sched_group_power {
+	atomic_t ref;
+	/*
+	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
+	 * single CPU.
+	 */
+	unsigned int power, power_orig;
+	unsigned long next_update;
+	/*
+	 * Number of busy cpus in this group.
+	 */
+	atomic_t nr_busy_cpus;
+
+	unsigned long cpumask[0]; /* iteration mask */
+};
+
+struct sched_group {
+	struct sched_group *next;	/* Must be a circular list */
+	atomic_t ref;
+
+	unsigned int group_weight;
+	struct sched_group_power *sgp;
+
+	/*
+	 * The CPUs this group covers.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 */
+	unsigned long cpumask[0];
+};
+
+static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+{
+	return to_cpumask(sg->cpumask);
+}
+
+/*
+ * cpumask masking which cpus in the group are allowed to iterate up the domain
+ * tree.
+ */
+static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+{
+	return to_cpumask(sg->sgp->cpumask);
+}
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+	return cpumask_first(sched_group_cpus(group));
+}
+
 extern int group_balance_cpu(struct sched_group *sg);

 #endif /* CONFIG_SMP */
@@ -784,6 +877,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 }
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */

+/*
+ * wake flags
+ */
+#define WF_SYNC		0x01		/* waker goes to sleep after wakeup */
+#define WF_FORK		0x02		/* child wakeup after fork */
+#define WF_MIGRATED	0x4		/* internal use, task got migrated */

 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
@@ -856,14 +955,61 @@ static const u32 prio_to_wmult[40] = {
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };

-/* Time spent by the tasks of the cpu accounting group executing in ... */
-enum cpuacct_stat_index {
-	CPUACCT_STAT_USER,	/* ... user mode */
-	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */
+#define ENQUEUE_WAKEUP		1
+#define ENQUEUE_HEAD		2
+#ifdef CONFIG_SMP
+#define ENQUEUE_WAKING		4	/* sched_class::task_waking was called */
+#else
+#define ENQUEUE_WAKING		0
+#endif

-	CPUACCT_STAT_NSTATS,
-};
+#define DEQUEUE_SLEEP		1

+struct sched_class {
+	const struct sched_class *next;
+
+	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+	void (*yield_task) (struct rq *rq);
+	bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
+
+	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
+
+	struct task_struct * (*pick_next_task) (struct rq *rq);
+	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+
+#ifdef CONFIG_SMP
+	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
+
+	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+	void (*post_schedule) (struct rq *this_rq);
+	void (*task_waking) (struct task_struct *task);
+	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
+
+	void (*set_cpus_allowed)(struct task_struct *p,
+				 const struct cpumask *newmask);
+
+	void (*rq_online)(struct rq *rq);
+	void (*rq_offline)(struct rq *rq);
+#endif
+
+	void (*set_curr_task) (struct rq *rq);
+	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
+	void (*task_fork) (struct task_struct *p);
+
+	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
+			     int oldprio);
+
+	unsigned int (*get_rr_interval) (struct rq *rq,
+					 struct task_struct *task);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	void (*task_move_group) (struct task_struct *p, int on_rq);
+#endif
+};

 #define sched_class_highest (&stop_sched_class)
 #define for_each_class(class) \
@@ -877,9 +1023,23 @@ extern const struct sched_class idle_sched_class;

 #ifdef CONFIG_SMP

+extern void update_group_power(struct sched_domain *sd, int cpu);
+
 extern void trigger_load_balance(struct rq *rq, int cpu);
 extern void idle_balance(int this_cpu, struct rq *this_rq);

+/*
+ * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
+ * becomes useful in lb
+ */
+#if defined(CONFIG_FAIR_GROUP_SCHED)
+extern void idle_enter_fair(struct rq *this_rq);
+extern void idle_exit_fair(struct rq *this_rq);
+#else
+static inline void idle_enter_fair(struct rq *this_rq) {}
+static inline void idle_exit_fair(struct rq *this_rq) {}
+#endif
+
 #else	/* CONFIG_SMP */

 static inline void idle_balance(int cpu, struct rq *rq)
@@ -891,7 +1051,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
 extern void sysrq_sched_debug_show(void);
 extern void sched_init_granularity(void);
 extern void update_max_interval(void);
-extern void update_group_power(struct sched_domain *sd, int cpu);
 extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
 extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
@@ -904,45 +1063,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime

 extern void update_idle_cpu_load(struct rq *this_rq);

-#ifdef CONFIG_CGROUP_CPUACCT
-#include <linux/cgroup.h>
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
-	struct cgroup_subsys_state css;
-	/* cpuusage holds pointer to a u64-type object on every cpu */
-	u64 __percpu *cpuusage;
-	struct kernel_cpustat __percpu *cpustat;
-};
-
-extern struct cgroup_subsys cpuacct_subsys;
-extern struct cpuacct root_cpuacct;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
-	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
-	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
-static inline struct cpuacct *parent_ca(struct cpuacct *ca)
-{
-	if (!ca || !ca->css.cgroup->parent)
-		return NULL;
-	return cgroup_ca(ca->css.cgroup->parent);
-}
-
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-#else
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
-#endif
-
 #ifdef CONFIG_PARAVIRT
 static inline u64 steal_ticks(u64 steal)
 {
@@ -956,6 +1076,16 @@ static inline u64 steal_ticks(u64 steal)
 static inline void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
+
+#ifdef CONFIG_NO_HZ_FULL
+	if (rq->nr_running == 2) {
+		if (tick_nohz_full_cpu(rq->cpu)) {
+			/* Order rq->nr_running write against the IPI */
+			smp_wmb();
+			smp_send_reschedule(rq->cpu);
+		}
+       }
+#endif
 }

 static inline void dec_nr_running(struct rq *rq)
@@ -963,6 +1093,13 @@ static inline void dec_nr_running(struct rq *rq)
 	rq->nr_running--;
 }

+static inline void rq_last_tick_reset(struct rq *rq)
+{
+#ifdef CONFIG_NO_HZ_FULL
+	rq->last_sched_tick = jiffies;
+#endif
+}
+
 extern void update_rq_clock(struct rq *rq);

 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1183,11 +1320,10 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);

 extern void account_cfs_bandwidth_used(int enabled, int was_enabled);

-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
 enum rq_nohz_flag_bits {
 	NOHZ_TICK_STOPPED,
 	NOHZ_BALANCE_KICK,
-	NOHZ_IDLE,
 };

 #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)
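
For readers skimming the hunk that introduces SCHED_LOAD_RESOLUTION: scale_load() and scale_load_down() are a plain fixed-point shift of the task weight. The sketch below is a standalone illustration, not kernel code; it hard-codes the 10-bit resolution that the diff keeps disabled under "#if 0", and the nice_0_weight variable is invented for the example.

/*
 * Standalone sketch (not part of the kernel tree): shows the round trip
 * performed by scale_load()/scale_load_down() from the hunk above, assuming
 * the BITS_PER_LONG > 32 resolution of 10 bits.
 */
#include <stdio.h>

#define SCHED_LOAD_RESOLUTION	10	/* assumed: the 64-bit case */
#define scale_load(w)		((w) << SCHED_LOAD_RESOLUTION)
#define scale_load_down(w)	((w) >> SCHED_LOAD_RESOLUTION)

int main(void)
{
	unsigned long nice_0_weight = 1024;	/* user-visible nice-0 weight (illustrative) */

	/* internally the weight carries 10 extra bits of resolution ... */
	unsigned long internal = scale_load(nice_0_weight);	/* 1048576 */

	/* ... and scaling back down recovers the user-visible value */
	printf("internal=%lu user=%lu\n", internal, scale_load_down(internal));
	return 0;
}

In the branch the diff actually compiles (SCHED_LOAD_RESOLUTION 0), both macros are identity operations, so SCHED_LOAD_SHIFT stays at 10 and SCHED_LOAD_SCALE/NICE_0_LOAD remain 1 << 10 = 1024.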
