author    | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-21 15:40:24 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-21 15:40:24 -0700
commit    | ec965350bb98bd291eb34f6ecddfdcfc36da1e6e
tree      | 983bcaf33ed00b48a86f7f8790cc460cf15dd252 /kernel
parent    | 5f033bb9bc5cb3bb37a79e3ef131f50ecdcb72b0
parent    | 486fdae21458bd9f4e125099bb3c38a4064e450e

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel: (62 commits)
sched: build fix
sched: better rt-group documentation
sched: features fix
sched: /debug/sched_features
sched: add SCHED_FEAT_DEADLINE
sched: debug: show a weight tree
sched: fair: weight calculations
sched: fair-group: de-couple load-balancing from the rb-trees
sched: fair-group scheduling vs latency
sched: rt-group: optimize dequeue_rt_stack
sched: debug: add some debug code to handle the full hierarchy
sched: fair-group: SMP-nice for group scheduling
sched, cpuset: customize sched domains, core
sched, cpuset: customize sched domains, docs
sched: prepatory code movement
sched: rt: multi level group constraints
sched: task_group hierarchy
sched: fix the task_group hierarchy for UID grouping
sched: allow the group scheduler to have multiple levels
sched: mix tasks and groups
...
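
Among the commits above, "sched: /debug/sched_features" exposes the scheduler's feature bits as a debugfs file; the read/write handlers appear in the kernel/sched.c hunk at the end of this diff. The following is a minimal userspace sketch of poking that file, assuming debugfs is mounted at /sys/kernel/debug and that HRTICK is one of the feature names listed there (it appears in the old SCHED_FEAT_* enum being replaced):

```c
#include <stdio.h>

int main(void)
{
	/* assumption: debugfs mounted at the conventional location */
	const char *path = "/sys/kernel/debug/sched_features";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	/* the file is a single space-separated line of feature names,
	 * with disabled features prefixed by "NO_" */
	if (fgets(line, sizeof(line), f))
		printf("current features: %s", line);
	fclose(f);

	/* writing "NO_<name>" clears that feature bit, per the
	 * sched_feat_write() parser in kernel/sched.c below */
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fputs("NO_HRTICK", f);
	fclose(f);
	return 0;
}
```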
Diffstat (limited to 'kernel')

-rw-r--r-- | kernel/compat.c          |    2
-rw-r--r-- | kernel/cpu.c             |    6
-rw-r--r-- | kernel/cpuset.c          |  100
-rw-r--r-- | kernel/irq/chip.c        |    2
-rw-r--r-- | kernel/kmod.c            |    2
-rw-r--r-- | kernel/kthread.c         |    1
-rw-r--r-- | kernel/latencytop.c      |   27
-rw-r--r-- | kernel/rcupreempt.c      |    4
-rw-r--r-- | kernel/rcutorture.c      |   15
-rw-r--r-- | kernel/sched.c           | 1912
-rw-r--r-- | kernel/sched_debug.c     |   36
-rw-r--r-- | kernel/sched_fair.c      |  580
-rw-r--r-- | kernel/sched_features.h  |   10
-rw-r--r-- | kernel/sched_rt.c        |  227
-rw-r--r-- | kernel/sched_stats.h     |    8
-rw-r--r-- | kernel/softirq.c         |   63
-rw-r--r-- | kernel/stop_machine.c    |    2
-rw-r--r-- | kernel/sysctl.c          |   15
-rw-r--r-- | kernel/time/tick-sched.c |    5
-rw-r--r-- | kernel/user.c            |   30

20 files changed, 2357 insertions, 690 deletions
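
A pattern that recurs throughout the diff below is the switch from pass-by-value cpumask interfaces (set_cpus_allowed(), cpuset_cpus_allowed() returning a cpumask_t) to pass-by-pointer ones (set_cpus_allowed_ptr(), cpuset_cpus_allowed() filling a caller-supplied mask). The sketch below is an illustrative userspace analogue, not kernel code: with a large NR_CPUS the mask is a sizable struct, and passing it by value copies the whole thing on every call.

```c
#include <string.h>

#define NR_CPUS 4096	/* assumption: a large-NR_CPUS configuration */

typedef struct {
	unsigned long bits[NR_CPUS / (8 * sizeof(unsigned long))];
} cpumask_t;		/* 512 bytes with the value above */

/* Old style: the entire mask is copied onto the stack for each call. */
static void set_mask_by_value(cpumask_t mask)
{
	(void)mask;
}

/* New style, like set_cpus_allowed_ptr(): only a pointer is passed. */
static void set_mask_by_ptr(const cpumask_t *mask)
{
	(void)mask;
}

int main(void)
{
	cpumask_t m;

	memset(&m, 0xff, sizeof(m));	/* rough cpus_setall() analogue */
	set_mask_by_value(m);		/* copies sizeof(cpumask_t) bytes */
	set_mask_by_ptr(&m);		/* copies only a pointer */
	return 0;
}
```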
diff --git a/kernel/compat.c b/kernel/compat.c index 9c48abfcd4a..e1ef04870c2 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -445,7 +445,7 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, if (retval) return retval; - return sched_setaffinity(pid, new_mask); + return sched_setaffinity(pid, &new_mask); } asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, diff --git a/kernel/cpu.c b/kernel/cpu.c index 2eff3f63abe..2011ad8d269 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -232,9 +232,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* Ensure that we are not runnable on dying cpu */ old_allowed = current->cpus_allowed; - tmp = CPU_MASK_ALL; + cpus_setall(tmp); cpu_clear(cpu, tmp); - set_cpus_allowed(current, tmp); + set_cpus_allowed_ptr(current, &tmp); p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); @@ -268,7 +268,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) out_thread: err = kthread_stop(p); out_allowed: - set_cpus_allowed(current, old_allowed); + set_cpus_allowed_ptr(current, &old_allowed); out_release: cpu_hotplug_done(); return err; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a1b61f41422..8b35fbd8292 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -98,6 +98,9 @@ struct cpuset { /* partition number for rebuild_sched_domains() */ int pn; + /* for custom sched domain */ + int relax_domain_level; + /* used for walking a cpuset heirarchy */ struct list_head stack_list; }; @@ -478,6 +481,16 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) return cpus_intersects(a->cpus_allowed, b->cpus_allowed); } +static void +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) +{ + if (!dattr) + return; + if (dattr->relax_domain_level < c->relax_domain_level) + dattr->relax_domain_level = c->relax_domain_level; + return; +} + /* * rebuild_sched_domains() * @@ -553,12 +566,14 @@ static void rebuild_sched_domains(void) int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_t *doms; /* resulting partition; i.e. 
sched domains */ + struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ q = NULL; csa = NULL; doms = NULL; + dattr = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { @@ -566,6 +581,11 @@ static void rebuild_sched_domains(void) doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); + if (dattr) { + *dattr = SD_ATTR_INIT; + update_domain_attr(dattr, &top_cpuset); + } *doms = top_cpuset.cpus_allowed; goto rebuild; } @@ -622,6 +642,7 @@ restart: doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; + dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; @@ -644,12 +665,15 @@ restart: } cpus_clear(*dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; for (j = i; j < csn; j++) { struct cpuset *b = csa[j]; if (apn == b->pn) { cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; + update_domain_attr(dattr, b); } } nslot++; @@ -660,7 +684,7 @@ restart: rebuild: /* Have scheduler rebuild sched domains */ get_online_cpus(); - partition_sched_domains(ndoms, doms); + partition_sched_domains(ndoms, doms, dattr); put_online_cpus(); done: @@ -668,6 +692,7 @@ done: kfifo_free(q); kfree(csa); /* Don't kfree(doms) -- partition_sched_domains() does that. */ + /* Don't kfree(dattr) -- partition_sched_domains() does that. */ } static inline int started_after_time(struct task_struct *t1, @@ -729,7 +754,7 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) */ void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { - set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); + set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); } /** @@ -1011,6 +1036,21 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) return 0; } +static int update_relax_domain_level(struct cpuset *cs, char *buf) +{ + int val = simple_strtol(buf, NULL, 10); + + if (val < 0) + val = -1; + + if (val != cs->relax_domain_level) { + cs->relax_domain_level = val; + rebuild_sched_domains(); + } + + return 0; +} + /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, @@ -1178,7 +1218,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, mutex_lock(&callback_mutex); guarantee_online_cpus(cs, &cpus); - set_cpus_allowed(tsk, cpus); + set_cpus_allowed_ptr(tsk, &cpus); mutex_unlock(&callback_mutex); from = oldcs->mems_allowed; @@ -1202,6 +1242,7 @@ typedef enum { FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_SCHED_LOAD_BALANCE, + FILE_SCHED_RELAX_DOMAIN_LEVEL, FILE_MEMORY_PRESSURE_ENABLED, FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, @@ -1256,6 +1297,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, case FILE_SCHED_LOAD_BALANCE: retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); break; + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + retval = update_relax_domain_level(cs, buffer); + break; case FILE_MEMORY_MIGRATE: retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); break; @@ -1354,6 +1398,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, case FILE_SCHED_LOAD_BALANCE: *s++ = is_sched_load_balance(cs) ? 
'1' : '0'; break; + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + s += sprintf(s, "%d", cs->relax_domain_level); + break; case FILE_MEMORY_MIGRATE: *s++ = is_memory_migrate(cs) ? '1' : '0'; break; @@ -1424,6 +1471,13 @@ static struct cftype cft_sched_load_balance = { .private = FILE_SCHED_LOAD_BALANCE, }; +static struct cftype cft_sched_relax_domain_level = { + .name = "sched_relax_domain_level", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, +}; + static struct cftype cft_memory_migrate = { .name = "memory_migrate", .read = cpuset_common_file_read, @@ -1475,6 +1529,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) return err; if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) return err; + if ((err = cgroup_add_file(cont, ss, + &cft_sched_relax_domain_level)) < 0) + return err; if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) return err; if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) @@ -1555,10 +1612,11 @@ static struct cgroup_subsys_state *cpuset_create( if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cs->cpus_allowed = CPU_MASK_NONE; - cs->mems_allowed = NODE_MASK_NONE; + cpus_clear(cs->cpus_allowed); + nodes_clear(cs->mems_allowed); cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); + cs->relax_domain_level = -1; cs->parent = parent; number_of_cpusets++; @@ -1625,12 +1683,13 @@ int __init cpuset_init(void) { int err = 0; - top_cpuset.cpus_allowed = CPU_MASK_ALL; - top_cpuset.mems_allowed = NODE_MASK_ALL; + cpus_setall(top_cpuset.cpus_allowed); + nodes_setall(top_cpuset.mems_allowed); fmeter_init(&top_cpuset.fmeter); top_cpuset.mems_generation = cpuset_mems_generation++; set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); + top_cpuset.relax_domain_level = -1; err = register_filesystem(&cpuset_fs_type); if (err < 0) @@ -1844,6 +1903,7 @@ void __init cpuset_init_smp(void) * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. * * Description: Returns the cpumask_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty @@ -1851,35 +1911,27 @@ void __init cpuset_init_smp(void) * tasks cpuset. **/ -cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) +void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) { - cpumask_t mask; - mutex_lock(&callback_mutex); - mask = cpuset_cpus_allowed_locked(tsk); + cpuset_cpus_allowed_locked(tsk, pmask); mutex_unlock(&callback_mutex); - - return mask; } /** * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. * Must be called with callback_mutex held. 
**/ -cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) +void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) { - cpumask_t mask; - task_lock(tsk); - guarantee_online_cpus(task_cs(tsk), &mask); + guarantee_online_cpus(task_cs(tsk), pmask); task_unlock(tsk); - - return mask; } void cpuset_init_current_mems_allowed(void) { - current->mems_allowed = NODE_MASK_ALL; + nodes_setall(current->mems_allowed); } /** @@ -2261,8 +2313,16 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, task->cpus_allowed); seq_printf(m, "\n"); + seq_printf(m, "Cpus_allowed_list:\t"); + m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, + task->cpus_allowed); + seq_printf(m, "\n"); seq_printf(m, "Mems_allowed:\t"); m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, task->mems_allowed); seq_printf(m, "\n"); + seq_printf(m, "Mems_allowed_list:\t"); + m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, + task->mems_allowed); + seq_printf(m, "\n"); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fdb3fbe2b0c..964964baefa 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -47,7 +47,7 @@ void dynamic_irq_init(unsigned int irq) desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - desc->affinity = CPU_MASK_ALL; + cpus_setall(desc->affinity); #endif spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/kmod.c b/kernel/kmod.c index 22be3ff3f36..e2764047ec0 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -165,7 +165,7 @@ static int ____call_usermodehelper(void *data) } /* We can run anywhere, unlike our parent keventd(). */ - set_cpus_allowed(current, CPU_MASK_ALL); + set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); /* * Our parent is keventd, which runs with elevated scheduling priority. 
diff --git a/kernel/kthread.c b/kernel/kthread.c index 0ac887882f9..25241d6ec8c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) wait_task_inactive(k); set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); + k->rt.nr_cpus_allowed = 1; } EXPORT_SYMBOL(kthread_bind); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index b4e3c85abe7..7c74dab0d21 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -64,8 +64,8 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record return; for (i = 0; i < MAXLR; i++) { - int q; - int same = 1; + int q, same = 1; + /* Nothing stored: */ if (!latency_record[i].backtrace[0]) { if (firstnonnull > i) @@ -73,12 +73,15 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record continue; } for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { - if (latency_record[i].backtrace[q] != - lat->backtrace[q]) + unsigned long record = lat->backtrace[q]; + + if (latency_record[i].backtrace[q] != record) { same = 0; - if (same && lat->backtrace[q] == 0) break; - if (same && lat->backtrace[q] == ULONG_MAX) + } + + /* 0 and ULONG_MAX entries mean end of backtrace: */ + if (record == 0 || record == ULONG_MAX) break; } if (same) { @@ -143,14 +146,18 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) for (i = 0; i < LT_SAVECOUNT ; i++) { struct latency_record *mylat; int same = 1; + mylat = &tsk->latency_record[i]; for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { - if (mylat->backtrace[q] != - lat.backtrace[q]) + unsigned long record = lat.backtrace[q]; + + if (mylat->backtrace[q] != record) { same = 0; - if (same && lat.backtrace[q] == 0) break; - if (same && lat.backtrace[q] == ULONG_MAX) + } + + /* 0 and ULONG_MAX entries mean end of backtrace: */ + if (record == 0 || record == ULONG_MAX) break; } if (same) { diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e9517014b57..e1cdf196a51 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1007,10 +1007,10 @@ void __synchronize_sched(void) if (sched_getaffinity(0, &oldmask) < 0) oldmask = cpu_possible_map; for_each_online_cpu(cpu) { - sched_setaffinity(0, cpumask_of_cpu(cpu)); + sched_setaffinity(0, &cpumask_of_cpu(cpu)); schedule(); } - sched_setaffinity(0, oldmask); + sched_setaffinity(0, &oldmask); } EXPORT_SYMBOL_GPL(__synchronize_sched); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index fd599829e72..47894f919d4 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -723,9 +723,10 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ */ static void rcu_torture_shuffle_tasks(void) { - cpumask_t tmp_mask = CPU_MASK_ALL; + cpumask_t tmp_mask; int i; + cpus_setall(tmp_mask); get_online_cpus(); /* No point in shuffling if there is only one online CPU (ex: UP) */ @@ -737,25 +738,27 @@ static void rcu_torture_shuffle_tasks(void) if (rcu_idle_cpu != -1) cpu_clear(rcu_idle_cpu, tmp_mask); - set_cpus_allowed(current, tmp_mask); + set_cpus_allowed_ptr(current, &tmp_mask); if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) - set_cpus_allowed(reader_tasks[i], tmp_mask); + set_cpus_allowed_ptr(reader_tasks[i], + &tmp_mask); } if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) - set_cpus_allowed(fakewriter_tasks[i], tmp_mask); + set_cpus_allowed_ptr(fakewriter_tasks[i], + &tmp_mask); } if (writer_task) - set_cpus_allowed(writer_task, tmp_mask); + 
set_cpus_allowed_ptr(writer_task, &tmp_mask); if (stats_task) - set_cpus_allowed(stats_task, tmp_mask); + set_cpus_allowed_ptr(stats_task, &tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; diff --git a/kernel/sched.c b/kernel/sched.c index 8dcdec6fe0f..57ba7ea9b74 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -66,6 +66,10 @@ #include <linux/unistd.h> #include <linux/pagemap.h> #include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/bootmem.h> +#include <linux/debugfs.h> +#include <linux/ctype.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -114,6 +118,11 @@ unsigned long long __attribute__((weak)) sched_clock(void) */ #define DEF_TIMESLICE (100 * HZ / 1000) +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + #ifdef CONFIG_SMP /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) @@ -155,6 +164,84 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; +struct rt_bandwidth { + /* nests inside the rq lock: */ + spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +static struct rt_bandwidth def_rt_bandwidth; + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + spin_lock_init(&rt_b->rt_runtime_lock); + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + ktime_t now; + + if (rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + spin_lock(&rt_b->rt_runtime_lock); + for (;;) { + if (hrtimer_active(&rt_b->rt_period_timer)) + break; + + now = hrtimer_cb_get_time(&rt_b->rt_period_timer); + hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); + hrtimer_start(&rt_b->rt_period_timer, + rt_b->rt_period_timer.expires, + HRTIMER_MODE_ABS); + } + spin_unlock(&rt_b->rt_runtime_lock); +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} +#endif + #ifdef CONFIG_GROUP_SCHED #include <linux/cgroup.h> @@ -181,29 +268,39 @@ struct task_group { struct sched_rt_entity **rt_se; struct rt_rq **rt_rq; - u64 rt_runtime; + struct rt_bandwidth rt_bandwidth; #endif struct rcu_head rcu; struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; }; +#ifdef CONFIG_USER_SCHED + +/* + * Root task group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. 
+ */ +struct task_group root_task_group; + #ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; - -static struct sched_entity *init_sched_entity_p[NR_CPUS]; -static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; #endif #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; - -static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; -static struct rt_rq *init_rt_rq_p[NR_CPUS]; +#endif +#else +#define root_task_group init_task_group #endif /* task_group_lock serializes add/remove of task groups and also changes to @@ -221,23 +318,15 @@ static DEFINE_MUTEX(doms_cur_mutex); # define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif +#define MIN_SHARES 2 + static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif /* Default task group. * Every task in system belong to this group at bootup. */ -struct task_group init_task_group = { -#ifdef CONFIG_FAIR_GROUP_SCHED - .se = init_sched_entity_p, - .cfs_rq = init_cfs_rq_p, -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - .rt_se = init_sched_rt_entity_p, - .rt_rq = init_rt_rq_p, -#endif -}; +struct task_group init_task_group; /* return group to which a task belongs */ static inline struct task_group *task_group(struct task_struct *p) @@ -297,8 +386,12 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; - struct rb_node *rb_load_balance_curr; - /* 'curr' points to currently running entity on this cfs_rq. + + struct list_head tasks; + struct list_head *balance_iterator; + + /* + * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr, *next; @@ -318,6 +411,43 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + unsigned long task_weight; + unsigned long shares; + /* + * We need space to build a sched_domain wide view of the full task + * group tree, in order to avoid depending on dynamic memory allocation + * during the load balancing we place this in the per cpu task group + * hierarchy. This limits the load balancing to one instance per cpu, + * but more should not be needed anyway. + */ + struct aggregate_struct { + /* + * load = weight(cpus) * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long load; + + /* + * part of the group weight distributed to this span. + */ + unsigned long shares; + + /* + * The sum of all runqueue weights within this span. + */ + unsigned long rq_weight; + + /* + * Weight contributed by tasks; this is the part we can + * influence by moving tasks around. 
+ */ + unsigned long task_weight; + } aggregate; +#endif #endif }; @@ -334,6 +464,9 @@ struct rt_rq { #endif int rt_throttled; u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED unsigned long rt_nr_boosted; @@ -396,6 +529,7 @@ struct rq { unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned char idle_at_tick; #ifdef CONFIG_NO_HZ + unsigned long last_tick_seen; unsigned char in_nohz_recently; #endif /* capture load from *all* tasks on this cpu: */ @@ -405,8 +539,6 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; - u64 rt_period_expire; - int rt_throttled; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -499,6 +631,32 @@ static inline int cpu_of(struct rq *rq) #endif } +#ifdef CONFIG_NO_HZ +static inline bool nohz_on(int cpu) +{ + return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE; +} + +static inline u64 max_skipped_ticks(struct rq *rq) +{ + return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1; +} + +static inline void update_last_tick_seen(struct rq *rq) +{ + rq->last_tick_seen = jiffies; +} +#else +static inline u64 max_skipped_ticks(struct rq *rq) +{ + return 1; +} + +static inline void update_last_tick_seen(struct rq *rq) +{ +} +#endif + /* * Update the per-runqueue clock, as finegrained as the platform can give * us, but without assuming monotonicity, etc.: @@ -523,9 +681,12 @@ static void __update_rq_clock(struct rq *rq) /* * Catch too large forward jumps too: */ - if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { - if (clock < rq->tick_timestamp + TICK_NSEC) - clock = rq->tick_timestamp + TICK_NSEC; + u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC; + u64 max_time = rq->tick_timestamp + max_jump; + + if (unlikely(clock + delta > max_time)) { + if (clock < max_time) + clock = max_time; else clock++; rq->clock_overflows++; @@ -561,23 +722,6 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -unsigned long rt_needs_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 delta; - - if (!rq->rt_throttled) - return 0; - - if (rq->clock > rq->rt_period_expire) - return 1; - - delta = rq->rt_period_expire - rq->clock; - do_div(delta, NSEC_PER_SEC / HZ); - - return (unsigned long)delta; -} - /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -590,22 +734,137 @@ unsigned long rt_needs_cpu(int cpu) /* * Debugging: various feature bits */ + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + enum { - SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, - SCHED_FEAT_WAKEUP_PREEMPT = 2, - SCHED_FEAT_START_DEBIT = 4, - SCHED_FEAT_HRTICK = 8, - SCHED_FEAT_DOUBLE_TICK = 16, +#include "sched_features.h" }; +#undef SCHED_FEAT + +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | + const_debug unsigned int sysctl_sched_features = - SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | - SCHED_FEAT_WAKEUP_PREEMPT * 1 | - SCHED_FEAT_START_DEBIT * 1 | - SCHED_FEAT_HRTICK * 1 | - SCHED_FEAT_DOUBLE_TICK * 0; +#include "sched_features.h" + 0; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_FEAT(name, enabled) \ + #name , + +__read_mostly char *sched_feat_names[] = { +#include "sched_features.h" + NULL +}; + +#undef SCHED_FEAT + +int sched_feat_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + return 0; +} + +static ssize_t +sched_feat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf; 
+ int r = 0; + int len = 0; + int i; -#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) + for (i = 0; sched_feat_names[i]; i++) { + len += strlen(sched_feat_names[i]); + len += 4; + } + + buf = kmalloc(len + 2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + for (i = 0; sched_feat_names[i]; i++) { + if (sysctl_sched_features & (1UL << i)) + r += sprintf(buf + r, "%s ", sched_feat_names[i]); + else + r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); + } + + r += sprintf(buf + r, "\n"); + WARN_ON(r >= len + 2); + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + kfree(buf); + + return r; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp = buf; + int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; sched_feat_names[i]; i++) { + int len = strlen(sched_feat_names[i]); + + if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (neg) + sysctl_sched_features &= ~(1UL << i); + else + sysctl_sched_features |= (1UL << i); + break; + } + } + + if (!sched_feat_names[i]) + return -EINVAL; + + filp->f_pos += cnt; + + return cnt; +} + +static struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .read = sched_feat_read, + .write = sched_feat_write, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#endif + +#define sched_feat(x) (sysctl |
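
The kernel/sched.c hunk above replaces the hand-maintained SCHED_FEAT_* enum and default mask with an x-macro include of kernel/sched_features.h: the same SCHED_FEAT(name, enabled) list is expanded once into bit indices, once into the default value of sysctl_sched_features, and (under CONFIG_SCHED_DEBUG) once into the name table used by the debugfs read/write handlers. A standalone sketch of the pattern follows; the inline MY_FEATURES list stands in for sched_features.h and the feature names are taken from the old enum:

```c
#include <stdio.h>

/* Stand-in for kernel/sched_features.h: one SCHED_FEAT() entry per line. */
#define MY_FEATURES(SCHED_FEAT)			\
	SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)	\
	SCHED_FEAT(WAKEUP_PREEMPT, 1)		\
	SCHED_FEAT(START_DEBIT, 1)		\
	SCHED_FEAT(HRTICK, 1)			\
	SCHED_FEAT(DOUBLE_TICK, 0)

/* Expansion 1: bit indices. */
#define FEAT_ENUM(name, enabled) __SCHED_FEAT_##name,
enum { MY_FEATURES(FEAT_ENUM) };

/* Expansion 2: default bitmask (enabled entries contribute their bit). */
#define FEAT_MASK(name, enabled) (1UL << __SCHED_FEAT_##name) * (enabled) |
static unsigned long sysctl_sched_features = MY_FEATURES(FEAT_MASK) 0;

/* Expansion 3: name strings, as used by the debugfs handlers. */
#define FEAT_NAME(name, enabled) #name,
static const char *sched_feat_names[] = { MY_FEATURES(FEAT_NAME) NULL };

#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

int main(void)
{
	int i;

	for (i = 0; sched_feat_names[i]; i++)
		printf("%s%s ",
		       (sysctl_sched_features & (1UL << i)) ? "" : "NO_",
		       sched_feat_names[i]);
	printf("\nHRTICK=%d DOUBLE_TICK=%d\n",
	       !!sched_feat(HRTICK), !!sched_feat(DOUBLE_TICK));
	return 0;
}
```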