From 7378547f2c83ca16a30d0a7c488a43a688ea0888 Mon Sep 17 00:00:00 2001
From: Milton Miller
Date: Wed, 24 Oct 2007 18:23:48 +0200
Subject: sched: fix sched_domain sysctl registration again

commit 029190c515f15f512ac85de8fc686d4dbd0ae731 (cpuset
sched_load_balance flag) was not tested with SCHED_DEBUG enabled: as
committed it dereferences NULL when used, and it reordered the sysctl
registration so that it never shows any domains or their tunables.

Fixes:

1) restore arch_init_sched_domains ordering

   we can't walk the domains before we build them

   presently we register cpus with empty directories (no domain
   directories or files).

2) make unregister_sched_domain_sysctl do nothing when already
   unregistered

   detach_destroy_domains is now called one set of cpus at a time;
   unregister_sysctl_table dereferences NULL if called with a NULL
   header.

   While the function would always dereference NULL if called twice,
   in the previous code it was always called exactly once and then
   followed by a register.  So only the hidden bug of the
   sysctl_root_table not being allocated, followed by an attempt to
   free it, would have shown the error.

3) always call unregister and register in partition_sched_domains

   The code is "smart" about unregistering only needed domains.  Since
   we aren't guaranteed any calls to unregister, always unregister.
   Without calling register on the way out we will not have a table or
   any sysctl tree.

4) warn if register is called without unregistering

   The previous table memory is lost, leaving pointers to the
   later-freed memory in sysctl and leaking the memory of the tables.

Before this patch, on a 2-core 4-thread box compiled for SMT and NUMA,
the domains appear empty (there are actually 3 levels per cpu).  And
as soon as the domains are rebuilt a second time, a NULL pointer is
dereferenced ("unreliable" in this trace is stack garbage):

bu19a:~# ls -R /proc/sys/kernel/sched_domain/
/proc/sys/kernel/sched_domain/:
cpu0  cpu1  cpu2  cpu3

/proc/sys/kernel/sched_domain/cpu0:

/proc/sys/kernel/sched_domain/cpu1:

/proc/sys/kernel/sched_domain/cpu2:

/proc/sys/kernel/sched_domain/cpu3:

bu19a:~# mkdir /dev/cpuset
bu19a:~# mount -tcpuset cpuset /dev/cpuset/
bu19a:~# cd /dev/cpuset/
bu19a:/dev/cpuset# echo 0 > sched_load_balance
bu19a:/dev/cpuset# mkdir one
bu19a:/dev/cpuset# echo 1 > one/cpus
bu19a:/dev/cpuset# echo 0 > one/sched_load_balance
Unable to handle kernel paging request for data at address 0x00000018
Faulting instruction address: 0xc00000000006b608
NIP: c00000000006b608 LR: c00000000006b604 CTR: 0000000000000000
REGS: c000000018d973f0 TRAP: 0300   Not tainted  (2.6.23-bml)
MSR: 9000000000009032 CR: 28242442  XER: 00000000
DAR: 0000000000000018, DSISR: 0000000040000000
TASK = c00000001912e340[1987] 'bash' THREAD: c000000018d94000 CPU: 2
..
NIP [c00000000006b608] .unregister_sysctl_table+0x38/0x110
LR [c00000000006b604] .unregister_sysctl_table+0x34/0x110
Call Trace:
[c000000018d97670] [c000000007017270] 0xc000000007017270 (unreliable)
[c000000018d97720] [c000000000058710] .detach_destroy_domains+0x30/0xb0
[c000000018d977b0] [c00000000005cf1c] .partition_sched_domains+0x1bc/0x230
[c000000018d97870] [c00000000009fdc4] .rebuild_sched_domains+0xb4/0x4c0
[c000000018d97970] [c0000000000a02e8] .update_flag+0x118/0x170
[c000000018d97a80] [c0000000000a1768] .cpuset_common_file_write+0x568/0x820
[c000000018d97c00] [c00000000009d95c] .cgroup_file_write+0x7c/0x180
[c000000018d97cf0] [c0000000000e76b8] .vfs_write+0xe8/0x1b0
[c000000018d97d90] [c0000000000e810c] .sys_write+0x4c/0x90
[c000000018d97e30] [c00000000000852c] syscall_exit+0x0/0x40

Signed-off-by: Milton Miller
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 2810e562a99..e51f0eabfef 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5461,11 +5461,12 @@ static void register_sched_domain_sysctl(void)
         struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
         char buf[32];
 
+        WARN_ON(sd_ctl_dir[0].child);
+        sd_ctl_dir[0].child = entry;
+
         if (entry == NULL)
                 return;
 
-        sd_ctl_dir[0].child = entry;
-
         for_each_online_cpu(i) {
                 snprintf(buf, 32, "cpu%d", i);
                 entry->procname = kstrdup(buf, GFP_KERNEL);
@@ -5473,14 +5474,19 @@ static void register_sched_domain_sysctl(void)
                 entry->child = sd_alloc_ctl_cpu_table(i);
                 entry++;
         }
+
+        WARN_ON(sd_sysctl_header);
         sd_sysctl_header = register_sysctl_table(sd_ctl_root);
 }
 
+/* may be called multiple times per register */
 static void unregister_sched_domain_sysctl(void)
 {
-        unregister_sysctl_table(sd_sysctl_header);
+        if (sd_sysctl_header)
+                unregister_sysctl_table(sd_sysctl_header);
         sd_sysctl_header = NULL;
-        sd_free_ctl_entry(&sd_ctl_dir[0].child);
+        if (sd_ctl_dir[0].child)
+                sd_free_ctl_entry(&sd_ctl_dir[0].child);
 }
 #else
 static void register_sched_domain_sysctl(void)
@@ -6424,13 +6430,17 @@ static cpumask_t fallback_doms;
  */
 static int arch_init_sched_domains(const cpumask_t *cpu_map)
 {
+        int err;
+
         ndoms_cur = 1;
         doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
         if (!doms_cur)
                 doms_cur = &fallback_doms;
         cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+        err = build_sched_domains(doms_cur);
         register_sched_domain_sysctl();
-        return build_sched_domains(doms_cur);
+
+        return err;
 }
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
@@ -6479,6 +6489,9 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
 {
         int i, j;
 
+        /* always unregister in case we don't destroy any domains */
+        unregister_sched_domain_sysctl();
+
         if (doms_new == NULL) {
                 ndoms_new = 1;
                 doms_new = &fallback_doms;
@@ -6514,6 +6527,8 @@ match2:
         kfree(doms_cur);
         doms_cur = doms_new;
         ndoms_cur = ndoms_new;
+
+        register_sched_domain_sysctl();
 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
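With all four fixes applied, the per-cpu directories should be
populated again.  Illustrative output on the same box (the exact set
of per-domain tunables varies by kernel version, so only the directory
layout is shown):

bu19a:~# ls /proc/sys/kernel/sched_domain/cpu0/
domain0  domain1  domain2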
From b15136e9497ef5d6e08cf665e0d0acf7a229f6dc Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 24 Oct 2007 18:23:48 +0200
Subject: sched: fix fastcall mismatch in completion APIs

Jeff Dike noticed that wait_for_completion_interruptible()'s prototype
had a mismatched fastcall.

Fix this by removing the fastcall attributes from all the completion
APIs.

Found-by: Jeff Dike
Signed-off-by: Ingo Molnar
---
 include/linux/completion.h | 18 +++++++++---------
 kernel/sched.c             | 10 +++++-----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/linux/completion.h b/include/linux/completion.h
index 268c5a4a2bd..33d6aaf9444 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -42,15 +42,15 @@ static inline void init_completion(struct completion *x)
         init_waitqueue_head(&x->wait);
 }
 
-extern void FASTCALL(wait_for_completion(struct completion *));
-extern int FASTCALL(wait_for_completion_interruptible(struct completion *x));
-extern unsigned long FASTCALL(wait_for_completion_timeout(struct completion *x,
-                                                unsigned long timeout));
-extern unsigned long FASTCALL(wait_for_completion_interruptible_timeout(
-                        struct completion *x, unsigned long timeout));
-
-extern void FASTCALL(complete(struct completion *));
-extern void FASTCALL(complete_all(struct completion *));
+extern void wait_for_completion(struct completion *);
+extern int wait_for_completion_interruptible(struct completion *x);
+extern unsigned long wait_for_completion_timeout(struct completion *x,
+                                                unsigned long timeout);
+extern unsigned long wait_for_completion_interruptible_timeout(
+                        struct completion *x, unsigned long timeout);
+
+extern void complete(struct completion *);
+extern void complete_all(struct completion *);
 
 #define INIT_COMPLETION(x)      ((x).done = 0)

diff --git a/kernel/sched.c b/kernel/sched.c
index e51f0eabfef..80edf29fa27 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3820,7 +3820,7 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
 
-void fastcall complete(struct completion *x)
+void complete(struct completion *x)
 {
         unsigned long flags;
 
@@ -3832,7 +3832,7 @@ void fastcall complete(struct completion *x)
 }
 EXPORT_SYMBOL(complete);
 
-void fastcall complete_all(struct completion *x)
+void complete_all(struct completion *x)
 {
         unsigned long flags;
 
@@ -3884,13 +3884,13 @@ wait_for_common(struct completion *x, long timeout, int state)
         return timeout;
 }
 
-void fastcall __sched wait_for_completion(struct completion *x)
+void __sched wait_for_completion(struct completion *x)
 {
         wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_for_completion);
 
-unsigned long fastcall __sched
+unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 {
         return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
@@ -3906,7 +3906,7 @@ int __sched wait_for_completion_interruptible(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible);
 
-unsigned long fastcall __sched
+unsigned long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
                                           unsigned long timeout)
 {
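For reference, the calling conventions of these APIs are unchanged by
the fastcall removal.  A minimal usage sketch (illustrative only; the
completion and function names are made up):

        #include <linux/completion.h>
        #include <linux/errno.h>

        static DECLARE_COMPLETION(setup_done);

        static int consumer(void)
        {
                /* sleeps until complete(); -ERESTARTSYS on a signal */
                if (wait_for_completion_interruptible(&setup_done))
                        return -ERESTARTSYS;
                return 0;
        }

        static void producer(void)
        {
                complete(&setup_done);  /* wakes up one waiter */
        }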
From 4dcf6aff023d9934630fb3649284951831c51f8f Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 24 Oct 2007 18:23:48 +0200
Subject: sched: clean up sched_domain_debug()

clean up sched_domain_debug().

this also shrinks the code a bit:

   text    data     bss     dec     hex filename
  50474    4306     480   55260    d7dc sched.o.before
  50404    4306     480   55190    d796 sched.o.after

Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 146 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 73 insertions(+), 73 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 80edf29fa27..af02a4de069 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5617,101 +5617,101 @@ int nr_cpu_ids __read_mostly = NR_CPUS;
 EXPORT_SYMBOL(nr_cpu_ids);
 
 #ifdef CONFIG_SCHED_DEBUG
-static void sched_domain_debug(struct sched_domain *sd, int cpu)
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
 {
-        int level = 0;
+        struct sched_group *group = sd->groups;
+        cpumask_t groupmask;
+        char str[NR_CPUS];
 
-        if (!sd) {
-                printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
-                return;
+        cpumask_scnprintf(str, NR_CPUS, sd->span);
+        cpus_clear(groupmask);
+
+        printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+        if (!(sd->flags & SD_LOAD_BALANCE)) {
+                printk("does not load-balance\n");
+                if (sd->parent)
+                        printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+                                        " has parent");
+                return -1;
         }
 
-        printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+        printk(KERN_CONT "span %s\n", str);
+
+        if (!cpu_isset(cpu, sd->span)) {
+                printk(KERN_ERR "ERROR: domain->span does not contain "
+                                "CPU%d\n", cpu);
+        }
+        if (!cpu_isset(cpu, group->cpumask)) {
+                printk(KERN_ERR "ERROR: domain->groups does not contain"
+                                " CPU%d\n", cpu);
+        }
 
+        printk(KERN_DEBUG "%*s groups:", level + 1, "");
         do {
-                int i;
-                char str[NR_CPUS];
-                struct sched_group *group = sd->groups;
-                cpumask_t groupmask;
-
-                cpumask_scnprintf(str, NR_CPUS, sd->span);
-                cpus_clear(groupmask);
-
-                printk(KERN_DEBUG);
-                for (i = 0; i < level + 1; i++)
-                        printk(" ");
-                printk("domain %d: ", level);
-
-                if (!(sd->flags & SD_LOAD_BALANCE)) {
-                        printk("does not load-balance\n");
-                        if (sd->parent)
-                                printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
-                                                " has parent");
+                if (!group) {
+                        printk("\n");
+                        printk(KERN_ERR "ERROR: group is NULL\n");
                         break;
                 }
 
-                printk("span %s\n", str);
+                if (!group->__cpu_power) {
+                        printk(KERN_CONT "\n");
+                        printk(KERN_ERR "ERROR: domain->cpu_power not "
+                                        "set\n");
+                        break;
+                }
 
-                if (!cpu_isset(cpu, sd->span))
-                        printk(KERN_ERR "ERROR: domain->span does not contain "
-                                        "CPU%d\n", cpu);
-                if (!cpu_isset(cpu, group->cpumask))
-                        printk(KERN_ERR "ERROR: domain->groups does not contain"
-                                        " CPU%d\n", cpu);
+                if (!cpus_weight(group->cpumask)) {
+                        printk(KERN_CONT "\n");
+                        printk(KERN_ERR "ERROR: empty group\n");
+                        break;
+                }
 
-                printk(KERN_DEBUG);
-                for (i = 0; i < level + 2; i++)
-                        printk(" ");
-                printk("groups:");
-                do {
-                        if (!group) {
-                                printk("\n");
-                                printk(KERN_ERR "ERROR: group is NULL\n");
-                                break;
-                        }
+                if (cpus_intersects(groupmask, group->cpumask)) {
+                        printk(KERN_CONT "\n");
+                        printk(KERN_ERR "ERROR: repeated CPUs\n");
+                        break;
+                }
 
-                        if (!group->__cpu_power) {
-                                printk(KERN_CONT "\n");
-                                printk(KERN_ERR "ERROR: domain->cpu_power not "
-                                                "set\n");
-                                break;
-                        }
+                cpus_or(groupmask, groupmask, group->cpumask);
 
-                        if (!cpus_weight(group->cpumask)) {
-                                printk(KERN_CONT "\n");
-                                printk(KERN_ERR "ERROR: empty group\n");
-                                break;
-                        }
+                cpumask_scnprintf(str, NR_CPUS, group->cpumask);
+                printk(KERN_CONT " %s", str);
 
-                        if (cpus_intersects(groupmask, group->cpumask)) {
-                                printk(KERN_CONT "\n");
-                                printk(KERN_ERR "ERROR: repeated CPUs\n");
-                                break;
-                        }
+                group = group->next;
+        } while (group != sd->groups);
+        printk(KERN_CONT "\n");
 
-                        cpus_or(groupmask, groupmask, group->cpumask);
+        if (!cpus_equal(sd->span, groupmask))
+                printk(KERN_ERR "ERROR: groups don't span domain->span\n");
 
-                        cpumask_scnprintf(str, NR_CPUS, group->cpumask);
-                        printk(KERN_CONT " %s", str);
+        if (sd->parent && !cpus_subset(groupmask, sd->parent->span))
+                printk(KERN_ERR "ERROR: parent span is not a superset "
+                        "of domain->span\n");
+        return 0;
+}
 
-                        group = group->next;
-                } while (group != sd->groups);
-                printk(KERN_CONT "\n");
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+        int level = 0;
 
-                if (!cpus_equal(sd->span, groupmask))
-                        printk(KERN_ERR "ERROR: groups don't span "
-                                        "domain->span\n");
+        if (!sd) {
+                printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+                return;
+        }
 
+        printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+        for (;;) {
+                if (sched_domain_debug_one(sd, cpu, level))
+                        break;
                 level++;
                 sd = sd->parent;
                 if (!sd)
-                        continue;
-
-                if (!cpus_subset(groupmask, sd->span))
-                        printk(KERN_ERR "ERROR: parent span is not a superset "
-                                "of domain->span\n");
-
-        } while (sd);
+                        break;
+        }
 }
 #else
 # define sched_domain_debug(sd, cpu) do { } while (0)

From 838225b48edc971620cbeb292034dabd2b0d7d1d Mon Sep 17 00:00:00 2001
From: Satyam Sharma
Date: Wed, 24 Oct 2007 18:23:50 +0200
Subject: sched: use show_regs() to improve __schedule_bug() output

A full register dump along with stack backtrace would make the
"scheduling while atomic" message more helpful.  Use show_regs()
instead of dump_stack() for this.  We already know we're atomic in here
(that is why this function was called), so show_regs()'s atomicity
expectations are guaranteed.

Also, modify the output of the "BUG: scheduling while atomic:" header a
bit to keep task->comm and task->pid together and preempt_count() after
them.

Signed-off-by: Satyam Sharma
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index af02a4de069..d1e6663d3ab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -66,6 +66,7 @@
 #include <asm/tlb.h>
 #include <asm/unistd.h>
+#include <asm/irq_regs.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
@@ -3507,12 +3508,19 @@ EXPORT_SYMBOL(sub_preempt_count);
  */
 static noinline void __schedule_bug(struct task_struct *prev)
 {
-        printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
-                prev->comm, preempt_count(), task_pid_nr(prev));
+        struct pt_regs *regs = get_irq_regs();
+
+        printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
+                prev->comm, prev->pid, preempt_count());
+
         debug_show_held_locks(prev);
         if (irqs_disabled())
                 print_irqtrace_events(prev);
-        dump_stack();
+
+        if (regs)
+                show_regs(regs);
+        else
+                dump_stack();
 }
 
 /*
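The regs-or-backtrace fallback generalizes to any debug-report path
that may run in either interrupt or process context.  A sketch
(report_state() is a made-up name):

        #include <linux/kernel.h>
        #include <linux/sched.h>
        #include <asm/irq_regs.h>

        static void report_state(void)
        {
                struct pt_regs *regs = get_irq_regs();

                if (regs)               /* interrupt regs available */
                        show_regs(regs);
                else                    /* process context: backtrace only */
                        dump_stack();
        }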
From b3da2a73ff5a2953a4ad8ebbf0aa7e6965ff9de2 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Wed, 24 Oct 2007 18:23:50 +0200
Subject: sched: document profile=sleep requiring CONFIG_SCHEDSTATS

profile=sleep only works if CONFIG_SCHEDSTATS is set.  This patch notes
the limitation in Documentation/kernel-parameters.txt and prints a
warning at boot-time if profile=sleep is used without CONFIG_SCHEDSTATS.

Signed-off-by: Mel Gorman
Signed-off-by: Ingo Molnar
---
 Documentation/kernel-parameters.txt | 3 ++-
 kernel/profile.c                    | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a13d69b2217..8ae5fac08df 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1444,7 +1444,8 @@ and is between 256 and 4096 characters. It is defined in the file
                         Param: "schedule" - profile schedule points.
                         Param: <number> - step/bucket size as a power of 2 for
                                 statistical time based profiling.
-                        Param: "sleep" - profile D-state sleeping (millisecs)
+                        Param: "sleep" - profile D-state sleeping (millisecs).
+                                Requires CONFIG_SCHEDSTATS
                         Param: "kvm" - profile VM exits.
 
         processor.max_cstate=   [HW,ACPI]

diff --git a/kernel/profile.c b/kernel/profile.c
index 631b75c25d7..5e95330e512 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -60,6 +60,7 @@ static int __init profile_setup(char * str)
         int par;
 
         if (!strncmp(str, sleepstr, strlen(sleepstr))) {
+#ifdef CONFIG_SCHEDSTATS
                 prof_on = SLEEP_PROFILING;
                 if (str[strlen(sleepstr)] == ',')
                         str += strlen(sleepstr) + 1;
@@ -68,6 +69,10 @@ static int __init profile_setup(char * str)
                 printk(KERN_INFO
                         "kernel sleep profiling enabled (shift: %ld)\n",
                         prof_shift);
+#else
+                printk(KERN_WARNING
+                        "kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
+#endif /* CONFIG_SCHEDSTATS */
         } else if (!strncmp(str, schedstr, strlen(schedstr))) {
                 prof_on = SCHED_PROFILING;
                 if (str[strlen(schedstr)] == ',')
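Usage reminder (illustrative): sleep profiling is requested on the
kernel command line, e.g.:

        profile=sleep,2

where 2 is the step/bucket size as a power of 2, and the accumulated
data is read back from /proc/profile (for example with readprofile(8)).
On a kernel built without CONFIG_SCHEDSTATS=y the same command line now
produces the warning above instead of silently collecting nothing.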
From 2b01dfe37203e825edd8417ad3993d01cbbb527e Mon Sep 17 00:00:00 2001
From: Paul Menage
Date: Wed, 24 Oct 2007 18:23:50 +0200
Subject: sched: clean up some control group code

- replace "cont" with "cgrp" in a few places in the CFS cgroup code,
- use write_uint rather than write for cpu.shares write function

Signed-off-by: Paul Menage
Acked-by: Srivatsa Vaddagiri
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 53 ++++++++++++++++++-----------------------------------
 1 file changed, 18 insertions(+), 35 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index d1e6663d3ab..cc9cd5b710a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7124,25 +7124,25 @@ unsigned long sched_group_shares(struct task_group *tg)
 #ifdef CONFIG_FAIR_CGROUP_SCHED
 
 /* return corresponding task_group object of a cgroup */
-static inline struct task_group *cgroup_tg(struct cgroup *cont)
+static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
 {
-        return container_of(cgroup_subsys_state(cont, cpu_cgroup_subsys_id),
-                            struct task_group, css);
+        return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
+                            struct task_group, css);
 }
 
 static struct cgroup_subsys_state *
-cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
+cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
         struct task_group *tg;
 
-        if (!cont->parent) {
+        if (!cgrp->parent) {
                 /* This is early initialization for the top cgroup */
-                init_task_group.css.cgroup = cont;
+                init_task_group.css.cgroup = cgrp;
                 return &init_task_group.css;
         }
 
         /* we support only 1-level deep hierarchical scheduler atm */
-        if (cont->parent->parent)
+        if (cgrp->parent->parent)
                 return ERR_PTR(-EINVAL);
 
         tg = sched_create_group();
@@ -7150,21 +7150,21 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
                 return ERR_PTR(-ENOMEM);
 
         /* Bind the cgroup to task_group object we just created */
-        tg->css.cgroup = cont;
+        tg->css.cgroup = cgrp;
 
         return &tg->css;
 }
 
 static void cpu_cgroup_destroy(struct cgroup_subsys *ss,
-                               struct cgroup *cont)
+                               struct cgroup *cgrp)
 {
-        struct task_group *tg = cgroup_tg(cont);
+        struct task_group *tg = cgroup_tg(cgrp);
 
         sched_destroy_group(tg);
 }
 
 static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
-                                 struct cgroup *cont, struct task_struct *tsk)
+                                 struct cgroup *cgrp, struct task_struct *tsk)
 {
         /* We don't support RT-tasks being in separate groups */
         if (tsk->sched_class != &fair_sched_class)
@@ -7174,38 +7174,21 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
 }
 
 static void
-cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
                         struct cgroup *old_cont, struct task_struct *tsk)
 {
         sched_move_task(tsk);
 }
 
-static ssize_t cpu_shares_write(struct cgroup *cont, struct cftype *cftype,
-                                struct file *file, const char __user *userbuf,
-                                size_t nbytes, loff_t *ppos)
+static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+                                 u64 shareval)
 {
-        unsigned long shareval;
-        struct task_group *tg = cgroup_tg(cont);
-        char buffer[2*sizeof(unsigned long) + 1];
-        int rc;
-
-        if (nbytes > 2*sizeof(unsigned long))   /* safety check */
-                return -E2BIG;
-
-        if (copy_from_user(buffer, userbuf, nbytes))
-                return -EFAULT;
-
-        buffer[nbytes] = 0;     /* nul-terminate */
-        shareval = simple_strtoul(buffer, NULL, 10);
-
-        rc = sched_group_set_shares(tg, shareval);
-
-        return (rc < 0 ? rc : nbytes);
+        return sched_group_set_shares(cgroup_tg(cgrp), shareval);
 }
 
-static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft)
+static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
 {
-        struct task_group *tg = cgroup_tg(cont);
+        struct task_group *tg = cgroup_tg(cgrp);
 
         return (u64) tg->shares;
 }
@@ -7213,7 +7196,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft)
 static struct cftype cpu_shares = {
         .name = "shares",
         .read_uint = cpu_shares_read_uint,
-        .write = cpu_shares_write,
+        .write_uint = cpu_shares_write_uint,
 };
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
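The userspace interface is unchanged by the conversion: cpu.shares
still reads and writes a decimal integer; only the parsing moves into
the cgroup core.  An illustrative session (mount point and group name
are made up):

        # mount -t cgroup -o cpu cpu /dev/cpuctl
        # mkdir /dev/cpuctl/big
        # echo 2048 > /dev/cpuctl/big/cpu.shares
        # cat /dev/cpuctl/big/cpu.shares
        2048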
From a0f846aa76c3e03d54c1700a87cab3a46ccd71e2 Mon Sep 17 00:00:00 2001
From: Adrian Bunk
Date: Wed, 24 Oct 2007 18:23:50 +0200
Subject: sched: make cpu_shares_{show,store}() static

cpu_shares_{show,store}() can become static.

Signed-off-by: Adrian Bunk
Signed-off-by: Ingo Molnar
---
 kernel/user.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/user.c b/kernel/user.c
index e91331c457e..0f3aa023410 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -129,7 +129,7 @@ static inline void uids_mutex_unlock(void)
 }
 
 /* return cpu shares held by the user */
-ssize_t cpu_shares_show(struct kset *kset, char *buffer)
+static ssize_t cpu_shares_show(struct kset *kset, char *buffer)
 {
         struct user_struct *up = container_of(kset, struct user_struct, kset);
 
@@ -137,7 +137,8 @@ ssize_t cpu_shares_show(struct kset *kset, char *buffer)
 }
 
 /* modify cpu shares held by the user */
-ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size)
+static ssize_t cpu_shares_store(struct kset *kset, const char *buffer,
+                                size_t size)
 {
         struct user_struct *up = container_of(kset, struct user_struct, kset);
         unsigned long shares;

From e1d1484f72127a5580d37c379f6a5b2c2786434c Mon Sep 17 00:00:00 2001
From: Peter Williams
Date: Wed, 24 Oct 2007 18:23:51 +0200
Subject: sched: reduce balance-tasks overhead

At the moment, balance_tasks() provides low level functionality for
both move_tasks() and move_one_task() (indirectly) via the
load_balance() function (in the sched_class interface), which also
provides dual functionality.  This dual functionality complicates the
interfaces and internal mechanisms and increases the run time overhead
of operations that are called with two run queue locks held.

This patch addresses this issue and reduces the overhead of these
operations.

Signed-off-by: Peter Williams
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h   |  7 +++-
 kernel/sched.c          | 99 +++++++++++++++++++++++++++++----------------
 kernel/sched_fair.c     | 44 ++++++++++++++++------
 kernel/sched_idletask.c | 14 +++++--
 kernel/sched_rt.c       | 28 +++++++++-----
 5 files changed, 135 insertions(+), 57 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 52288a64769..639241f4f3d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -829,11 +829,14 @@ struct sched_class {
         void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
         unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
-                        struct rq *busiest,
-                        unsigned long max_nr_move, unsigned long max_load_move,
+                        struct rq *busiest, unsigned long max_load_move,
                         struct sched_domain *sd, enum cpu_idle_type idle,
                         int *all_pinned, int *this_best_prio);
 
+        int (*move_one_task) (struct rq *this_rq, int this_cpu,
+                              struct rq *busiest, struct sched_domain *sd,
+                              enum cpu_idle_type idle);
+
         void (*set_curr_task) (struct rq *rq);
         void (*task_tick) (struct rq *rq, struct task_struct *p);
         void (*task_new) (struct rq *rq, struct task_struct *p);

diff --git a/kernel/sched.c b/kernel/sched.c
index cc9cd5b710a..8607795fad6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -838,11 +838,35 @@ struct rq_iterator {
         struct task_struct *(*next)(void *);
 };
 
-static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                         unsigned long max_nr_move, unsigned long max_load_move,
-                         struct sched_domain *sd, enum cpu_idle_type idle,
-                         int *all_pinned, unsigned long *load_moved,
-                         int *this_best_prio, struct rq_iterator *iterator);
+#ifdef CONFIG_SMP
+static unsigned long
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+              unsigned long max_load_move, struct sched_domain *sd,
+              enum cpu_idle_type idle, int *all_pinned,
+              int *this_best_prio, struct rq_iterator *iterator);
+
+static int
+iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                   struct sched_domain *sd, enum cpu_idle_type idle,
+                   struct rq_iterator *iterator);
+#else
+static inline unsigned long
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+              unsigned long max_load_move, struct sched_domain *sd,
+              enum cpu_idle_type idle, int *all_pinned,
+              int *this_best_prio, struct rq_iterator *iterator)
+{
+        return 0;
+}
+
+static inline int
+iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                   struct sched_domain *sd, enum cpu_idle_type idle,
+                   struct rq_iterator *iterator)
+{
+        return 0;
+}
+#endif
 
 #include "sched_stats.h"
 #include "sched_idletask.c"
@@ -2224,17 +2248,17 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
         return 1;
 }
 
-static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                      unsigned long max_nr_move, unsigned long max_load_move,
-                      struct sched_domain *sd, enum cpu_idle_type idle,
-                      int *all_pinned, unsigned long *load_moved,
-                      int *this_best_prio, struct rq_iterator *iterator)
+static unsigned long
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+              unsigned long max_load_move, struct sched_domain *sd,
+              enum cpu_idle_type idle, int *all_pinned,
+              int *this_best_prio, struct rq_iterator *iterator)
 {
         int pulled = 0, pinned = 0, skip_for_load;
         struct task_struct *p;
         long rem_load_move = max_load_move;
 
-        if (max_nr_move == 0 || max_load_move == 0)
+        if (max_load_move == 0)
                 goto out;
 
         pinned = 1;
@@ -2267,7 +2291,7 @@ next:
          * We only want to steal up to the prescribed number of tasks
          * and the prescribed amount of weighted load.
          */
-        if (pulled < max_nr_move && rem_load_move > 0) {
+        if (rem_load_move > 0) {
                 if (p->prio < *this_best_prio)
                         *this_best_prio = p->prio;
                 p = iterator->next(iterator->arg);
@@ -2275,7 +2299,7 @@ next:
         }
 out:
         /*
-         * Right now, this is the only place pull_task() is called,
+         * Right now, this is one of only two places pull_task() is called,
          * so we can safely collect pull_task() stats here rather than
          * inside pull_task().
          */
@@ -2283,8 +2307,8 @@ out:
         if (all_pinned)
                 *all_pinned = pinned;
 
-        *load_moved = max_load_move - rem_load_move;
-        return pulled;
+
+        return max_load_move - rem_load_move;
 }
 
 /*
@@ -2306,7 +2330,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
         do {
                 total_load_moved +=
                         class->load_balance(this_rq, this_cpu, busiest,
-                                ULONG_MAX, max_load_move - total_load_moved,
+                                max_load_move - total_load_moved,
                                 sd, idle, all_pinned, &this_best_prio);
                 class = class->next;
         } while (class && max_load_move > total_load_moved);
@@ -2314,6 +2338,32 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
         return total_load_moved > 0;
 }
 
+static int
+iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                   struct sched_domain *sd, enum cpu_idle_type idle,
+                   struct rq_iterator *iterator)
+{
+        struct task_struct *p = iterator->start(iterator->arg);
+        int pinned = 0;
+
+        while (p) {
+                if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
+                        pull_task(busiest, p, this_rq, this_cpu);
+                        /*
+                         * Right now, this is only the second place pull_task()
+                         * is called, so we can safely collect pull_task()
+                         * stats here rather than inside pull_task().
+                         */
+                        schedstat_inc(sd, lb_gained[idle]);
+
+                        return 1;
+                }
+                p = iterator->next(iterator->arg);
+        }
+
+        return 0;
+}
+
 /*
  * move_one_task tries to move exactly one task from busiest to this_rq, as
  * part of active balancing operations within "domain".
@@ -2325,12 +2375,9 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
                          struct sched_domain *sd, enum cpu_idle_type idle)
 {
         const struct sched_class *class;
-        int this_best_prio = MAX_PRIO;
 
         for (class = sched_class_highest; class; class = class->next)
-                if (class->load_balance(this_rq, this_cpu, busiest,
-                                        1, ULONG_MAX, sd, idle, NULL,
-                                        &this_best_prio))
+                if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
                         return 1;
 
         return 0;
@@ -3267,18 +3314,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
 {
 }
 
-/* Avoid "used but not defined" warning on UP */
-static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                         unsigned long max_nr_move, unsigned long max_load_move,
-                         struct sched_domain *sd, enum cpu_idle_type idle,
-                         int *all_pinned, unsigned long *load_moved,
-                         int *this_best_prio, struct rq_iterator *iterator)
-{
-        *load_moved = 0;
-
-        return 0;
-}
-
 #endif
 
 DEFINE_PER_CPU(struct kernel_stat, kstat);

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 166ed6db600..a90d0457d60 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -936,12 +936,11 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                  unsigned long max_nr_move, unsigned long max_load_move,
+                  unsigned long max_load_move,
                   struct sched_domain *sd, enum cpu_idle_type idle,
                   int *all_pinned, int *this_best_prio)
 {
         struct cfs_rq *busy_cfs_rq;
-        unsigned long load_moved, total_nr_moved = 0, nr_moved;
         long rem_load_move = max_load_move;
         struct rq_iterator cfs_rq_iterator;
 
@@ -969,25 +968,47 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 #else
 # define maxload rem_load_move
 #endif
-                /* pass busy_cfs_rq argument into
+                /*
+                 * pass busy_cfs_rq argument into
                  * load_balance_[start|next]_fair iterators
                  */
                 cfs_rq_iterator.arg = busy_cfs_rq;
-                nr_moved = balance_tasks(this_rq, this_cpu, busiest,
-                                max_nr_move, maxload, sd, idle, all_pinned,
-                                &load_moved, this_best_prio, &cfs_rq_iterator);
-
-                total_nr_moved += nr_moved;
-                max_nr_move -= nr_moved;
-                rem_load_move -= load_moved;
+                rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+                                               maxload, sd, idle, all_pinned,
+                                               this_best_prio,
+                                               &cfs_rq_iterator);
 
-                if (max_nr_move <= 0 || rem_load_move <= 0)
+                if (rem_load_move <= 0)
                         break;
         }
 
         return max_load_move - rem_load_move;
 }
 
+static int
+move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                   struct sched_domain *sd, enum cpu_idle_type idle)
+{
+        struct cfs_rq *busy_cfs_rq;
+        struct rq_iterator cfs_rq_iterator;
+
+        cfs_rq_iterator.start = load_balance_start_fair;
+        cfs_rq_iterator.next = load_balance_next_fair;
+
+        for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+                /*
+                 * pass busy_cfs_rq argument into
+                 * load_balance_[start|next]_fair iterators
+                 */
+                cfs_rq_iterator.arg = busy_cfs_rq;
+                if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
+                                       &cfs_rq_iterator))
+                        return 1;
+        }
+
+        return 0;
+}
+
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
@@ -1064,6 +1085,7 @@ static const struct sched_class fair_sched_class = {
         .put_prev_task          = put_prev_task_fair,
 
         .load_balance           = load_balance_fair,
+        .move_one_task          = move_one_task_fair,
 
         .set_curr_task          = set_curr_task_fair,
         .task_tick              = task_tick_fair,

diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 6e2ead41516..586b06ca30a 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -39,9 +39,16 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
 
 static unsigned long
 load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                  unsigned long max_nr_move, unsigned long max_load_move,
-                  struct sched_domain *sd, enum cpu_idle_type idle,
-                  int *all_pinned, int *this_best_prio)
+                  unsigned long max_load_move,
+                  struct sched_domain *sd, enum cpu_idle_type idle,
+                  int *all_pinned, int *this_best_prio)
+{
+        return 0;
+}
+
+static int
+move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                   struct sched_domain *sd, enum cpu_idle_type idle)
 {
         return 0;
 }
@@ -70,6 +77,7 @@ const struct sched_class idle_sched_class = {
         .put_prev_task = put_prev_task_idle,
 
         .load_balance = load_balance_idle,
+        .move_one_task = move_one_task_idle,
 
         .set_curr_task = set_curr_task_idle,
         .task_tick = task_tick_idle,

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d0097a0634e..e9395b7119e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -172,13 +172,11 @@ static struct task_struct *load_balance_next_rt(void *arg)
 
 static unsigned long
 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                unsigned long max_nr_move, unsigned long max_load_move,
-                struct sched_domain *sd, enum cpu_idle_type idle,
-                int *all_pinned, int *this_best_prio)
+                unsigned long max_load_move,
+                struct sched_domain *sd, enum cpu_idle_type idle,
+                int *all_pinned, int *this_best_prio)
 {
-        int nr_moved;
         struct rq_iterator rt_rq_iterator;
-        unsigned long load_moved;
 
         rt_rq_iterator.start = load_balance_start_rt;
         rt_rq_iterator.next = load_balance_next_rt;
@@ -187,11 +185,22 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
          */
         rt_rq_iterator.arg = busiest;
 
-        nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
-                        max_load_move, sd, idle, all_pinned, &load_moved,
-                        this_best_prio, &rt_rq_iterator);
+        return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
+                             idle, all_pinned, this_best_prio, &rt_rq_iterator);
+}
+
+static int
+move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                 struct sched_domain *sd, enum cpu_idle_type idle)
+{
+        struct rq_iterator rt_rq_iterator;
+
+        rt_rq_iterator.start = load_balance_start_rt;
+        rt_rq_iterator.next = load_balance_next_rt;
+        rt_rq_iterator.arg = busiest;
 
-        return load_moved;
+        return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
+                                  &rt_rq_iterator);
 }
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
@@ -237,6 +246,7 @@ const struct sched_class rt_sched_class = {
         .put_prev_task = put_prev_task_rt,
 
         .load_balance = load_balance_rt,
+        .move_one_task = move_one_task_rt,
 
         .set_curr_task = set_curr_task_rt,
         .task_tick = task_tick_rt,
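The shape of the resulting split: balance_tasks() and
iter_move_one_task() own the walk and the migration decisions, and
each scheduling class only supplies the start/next callbacks packaged
in an rq_iterator.  Distilled, the consumer loop is (simplified
sketch, not the kernel code; the callback names are made up):

        struct rq_iterator it = {
                .arg   = busiest_queue,         /* class-private cursor */
                .start = class_start,
                .next  = class_next,
        };
        struct task_struct *p;

        for (p = it.start(it.arg); p; p = it.next(it.arg)) {
                if (!can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
                        continue;
                /* move_one_task: pull p and stop; move_tasks: keep
                 * pulling until max_load_move worth of weighted load
                 * has been moved */
        }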
From 681f3e68541d6f03e3e05d21fe15093578b8b539 Mon Sep 17 00:00:00 2001
From: Peter Williams
Date: Wed, 24 Oct 2007 18:23:51 +0200
Subject: sched: isolate SMP balancing code a bit more

At the moment, a lot of load balancing code that is irrelevant to non
SMP systems gets included during non SMP builds.

This patch addresses this issue and reduces the binary size on non
SMP systems:

   text    data     bss     dec     hex filename
  10983      28    1192   12203    2fab sched.o.before
  10739      28    1192   11959    2eb7 sched.o.after

Signed-off-by: Peter Williams
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h   |  2 ++
 kernel/sched.c          | 17 -----------------
 kernel/sched_fair.c     |  4 ++++
 kernel/sched_idletask.c |  4 ++++
 kernel/sched_rt.c       |  4 ++++
 5 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 639241f4f3d..24e08d1d900 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -828,6 +828,7 @@ struct sched_class {
         struct task_struct * (*pick_next_task) (struct rq *rq);
         void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
+#ifdef CONFIG_SMP
         unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
                         struct rq *busiest, unsigned long max_load_move,
                         struct sched_domain *sd, enum cpu_idle_type idle,
@@ -836,6 +837,7 @@ struct sched_class {
         int (*move_one_task) (struct rq *this_rq, int this_cpu,
                               struct rq *busiest, struct sched_domain *sd,
                               enum cpu_idle_type idle);
+#endif
 
         void (*set_curr_task) (struct rq *rq);
         void (*task_tick) (struct rq *rq, struct task_struct *p);

diff --git a/kernel/sched.c b/kernel/sched.c
index 8607795fad6..b4fbbc44045 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -849,23 +849,6 @@ static int
 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
                    struct sched_domain *sd, enum cpu_idle_type idle,
                    struct rq_iterator *iterator);
-#else
-static inline unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-              unsigned long max_load_move, struct sched_domain *sd,
-              enum cpu_idle_type idle, int *all_pinned,
-              int *this_best_prio, struct rq_iterator *iterator)
-{
-        return 0;
-}
-
-static inline int
-iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                   struct sched_domain *sd, enum cpu_idle_type idle,
-                   struct rq_iterator *iterator)
-{
-        return 0;
-}
 #endif
 
 #include "sched_stats.h"

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a90d0457d60..9971831b560 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -876,6 +876,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
         }
 }
 
+#ifdef CONFIG_SMP
 /**************************************************
  * Fair scheduling class load-balancing methods:
  */
@@ -1008,6 +1009,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
         return 0;
 }
+#endif
 
 /*
  * scheduler tick hitting a task of our scheduling class:
@@ -1084,8 +1086,10 @@ static const struct sched_class fair_sched_class = {
         .pick_next_task         = pick_next_task_fair,
         .put_prev_task          = put_prev_task_fair,
 
+#ifdef CONFIG_SMP
         .load_balance           = load_balance_fair,
         .move_one_task          = move_one_task_fair,
+#endif
 
         .set_curr_task          = set_curr_task_fair,
         .task_tick              = task_tick_fair,

diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 586b06ca30a..bf9c25c15b8 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -37,6 +37,7 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
 {
 }
 
+#ifdef CONFIG_SMP
 static unsigned long
 load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
@@ -52,6 +53,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
 {
         return 0;
 }
+#endif
 
 static void task_tick_idle(struct rq *rq, struct task_struct *curr)
 {
@@ -76,8 +78,10 @@ const struct sched_class idle_sched_class = {
         .pick_next_task = pick_next_task_idle,
         .put_prev_task = put_prev_task_idle,
 
+#ifdef CONFIG_SMP
         .load_balance = load_balance_idle,
         .move_one_task = move_one_task_idle,
+#endif
 
         .set_curr_task = set_curr_task_idle,
         .task_tick = task_tick_idle,

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e9395b7119e..8abd752a0eb 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -98,6 +98,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
         p->se.exec_start = 0;
 }
 
+#ifdef CONFIG_SMP
 /*
  * Load-balancing iterator. Note: while the runqueue stays locked
  * during the whole iteration, the current task might be
@@ -202,6 +203,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
         return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
                                   &rt_rq_iterator);
 }
+#endif
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {
@@ -245,8 +247,10 @@ const struct sched_class rt_sched_class = {
         .pick_next_task = pick_next_task_rt,
         .put_prev_task = put_prev_task_rt,
 
+#ifdef CONFIG_SMP
         .load_balance = load_balance_rt,
         .move_one_task = move_one_task_rt,
+#endif
 
         .set_curr_task = set_curr_task_rt,
         .task_tick = task_tick_rt,