-rw-r--r--  Documentation/sched-design-CFS.txt |   67
-rw-r--r--  arch/i386/Kconfig                  |   11
-rw-r--r--  drivers/kvm/kvm.h                  |   10
-rw-r--r--  drivers/kvm/kvm_main.c             |    2
-rw-r--r--  fs/pipe.c                          |    9
-rw-r--r--  fs/proc/array.c                    |   17
-rw-r--r--  fs/proc/base.c                     |    2
-rw-r--r--  fs/proc/proc_misc.c                |   15
-rw-r--r--  include/linux/kernel_stat.h        |    1
-rw-r--r--  include/linux/sched.h              |   99
-rw-r--r--  include/linux/topology.h           |    5
-rw-r--r--  init/Kconfig                       |   21
-rw-r--r--  kernel/delayacct.c                 |    2
-rw-r--r--  kernel/exit.c                      |    6
-rw-r--r--  kernel/fork.c                      |    3
-rw-r--r--  kernel/ksysfs.c                    |    8
-rw-r--r--  kernel/sched.c                     | 1444
-rw-r--r--  kernel/sched_debug.c               |  282
-rw-r--r--  kernel/sched_fair.c                |  811
-rw-r--r--  kernel/sched_idletask.c            |    8
-rw-r--r--  kernel/sched_rt.c                  |   19
-rw-r--r--  kernel/sched_stats.h               |   28
-rw-r--r--  kernel/sysctl.c                    |   37
-rw-r--r--  kernel/user.c                      |  249
-rw-r--r--  net/unix/af_unix.c                 |    4
25 files changed, 1872 insertions(+), 1288 deletions(-)
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt
index 84901e7c050..88bcb876733 100644
--- a/Documentation/sched-design-CFS.txt
+++ b/Documentation/sched-design-CFS.txt
@@ -117,3 +117,70 @@ Some implementation details:
   iterators of the scheduling modules are used. The balancing code got
   quite a bit simpler as a result.
 
+
+Group scheduler extension to CFS
+================================
+
+Normally the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task. Sometimes, it may be desirable to group tasks
+and provide fair CPU time to each such task group. For example, it may
+be desirable to first provide fair CPU time to each user on the system
+and then to each task belonging to a user.
+
+CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
+SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such
+groups. At present, there are two (mutually exclusive) mechanisms to group
+tasks for CPU bandwidth control purposes:
+
+	- Based on user id (CONFIG_FAIR_USER_SCHED)
+	  In this option, tasks are grouped according to their user id.
+	- Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
+	  This option lets the administrator create arbitrary groups
+	  of tasks, using the "cgroup" pseudo filesystem. See
+	  Documentation/cgroups.txt for more information about this
+	  filesystem.
+
+Only one of these grouping mechanisms can be enabled at a time.
+
+Group scheduler tunables:
+
+When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
+each new user and a "cpu_share" file is added in that directory.
+
+	# cd /sys/kernel/uids
+	# cat 512/cpu_share		# Display user 512's CPU share
+	1024
+	# echo 2048 > 512/cpu_share	# Modify user 512's CPU share
+	# cat 512/cpu_share		# Display user 512's CPU share
+	2048
+	#
+
+CPU bandwidth between two users is divided in the ratio of their CPU shares.
+For example, if you would like user "root" to get twice the bandwidth of
+user "guest", set the cpu_share values such that "root"'s cpu_share is twice
+"guest"'s cpu_share.
+
+When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
+for each group created using the pseudo filesystem. See the example steps
+below to create task groups and modify their CPU share using the "cgroup"
+pseudo filesystem:
+
+	# mkdir /dev/cpuctl
+	# mount -t cgroup -ocpu none /dev/cpuctl
+	# cd /dev/cpuctl
+
+	# mkdir multimedia	# create "multimedia" group of tasks
+	# mkdir browser		# create "browser" group of tasks
+
+	# # Configure the multimedia group to receive twice the CPU
+	# # bandwidth of the browser group
+
+	# echo 2048 > multimedia/cpu.shares
+	# echo 1024 > browser/cpu.shares
+
+	# firefox &		# Launch firefox and move it to "browser" group
+	# echo <firefox_pid> > browser/tasks
+
+	# # Launch gmplayer (or your favourite movie player)
+	# echo <movie_player_pid> > multimedia/tasks
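Under either grouping option, a group's CPU fraction is simply its shares
value divided by the sum of the shares of all groups with runnable tasks.
A minimal user-space sketch of that arithmetic follows; it is not part of
the patch, and the print_split() helper is hypothetical:

    #include <stdio.h>

    /*
     * Print the expected CPU split for two groups from their shares
     * values. Sketch only: the actual split at any instant also depends
     * on which groups currently have runnable tasks.
     */
    static void print_split(const char *a, unsigned long sa,
                            const char *b, unsigned long sb)
    {
        unsigned long total = sa + sb;

        printf("%s: %.1f%%  %s: %.1f%%\n",
               a, 100.0 * sa / total,
               b, 100.0 * sb / total);
    }

    int main(void)
    {
        /* Values from the example above: 2048 vs. 1024 shares. */
        print_split("multimedia", 2048, "browser", 1024);
        return 0;
    }

Running it prints "multimedia: 66.7%  browser: 33.3%", matching the
two-to-one ratio configured through cpu.shares.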
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index f1486f8a3e6..bf9aafad497 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -214,6 +214,17 @@ config X86_ES7000
 
 endchoice
 
+config SCHED_NO_NO_OMIT_FRAME_POINTER
+	bool "Single-depth WCHAN output"
+	default y
+	help
+	  Calculate simpler /proc/<PID>/wchan values. If this option
+	  is disabled then wchan values will recurse back to the
+	  caller function. This provides more accurate wchan values,
+	  at the expense of slightly more scheduling overhead.
+
+	  If in doubt, say "Y".
+
 config PARAVIRT
 	bool "Paravirtualization support (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index ad0813843ad..3b0bc4bda5f 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -624,6 +624,16 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 
 int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
 
+static inline void kvm_guest_enter(void)
+{
+	current->flags |= PF_VCPU;
+}
+
+static inline void kvm_guest_exit(void)
+{
+	current->flags &= ~PF_VCPU;
+}
+
 static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 				     u32 error_code)
 {
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 353e58527d1..af2d288c881 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -2046,6 +2046,7 @@ again:
 	kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
 
 	vcpu->guest_mode = 1;
+	kvm_guest_enter();
 
 	if (vcpu->requests)
 		if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
@@ -2053,6 +2054,7 @@ again:
 
 	kvm_x86_ops->run(vcpu, kvm_run);
 
+	kvm_guest_exit();
 	vcpu->guest_mode = 0;
 	local_irq_enable();
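kvm_guest_enter() and kvm_guest_exit() only toggle PF_VCPU; the actual
accounting happens at the next timer tick, where system time for a task
with PF_VCPU set is redirected to the new guest counters. A simplified
sketch of that consumer, modeled on the account_guest_time() helper this
patch adds to kernel/sched.c (the excerpt above does not reach it, so
treat the details as an approximation):

    #include <linux/sched.h>
    #include <linux/kernel_stat.h>

    /*
     * Sketch: ticks that arrive while PF_VCPU is set are charged both
     * as user time and as guest time, in the per-task gtime field and
     * the per-cpu cpustat.guest counter added by this patch.
     */
    static void account_guest_time(struct task_struct *p, cputime_t cputime)
    {
        cputime64_t tmp = cputime_to_cputime64(cputime);
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

        /* Guest time is also reported as user time... */
        p->utime = cputime_add(p->utime, cputime);
        /* ...and separately in the new guest-time fields. */
        p->gtime = cputime_add(p->gtime, cputime);

        cpustat->user = cputime64_add(cpustat->user, tmp);
        cpustat->guest = cputime64_add(cpustat->guest, tmp);
    }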
diff --git a/fs/pipe.c b/fs/pipe.c
index 6b3d91a691b..e66ec48e95d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -45,8 +45,7 @@ void pipe_wait(struct pipe_inode_info *pipe)
 	 * Pipes are system-local resources, so sleeping on them
 	 * is considered a noninteractive wait:
 	 */
-	prepare_to_wait(&pipe->wait, &wait,
-			TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
+	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
 	if (pipe->inode)
 		mutex_unlock(&pipe->inode->i_mutex);
 	schedule();
@@ -383,7 +382,7 @@ redo:
 	/* Signal writers asynchronously that there is more room. */
 	if (do_wakeup) {
-		wake_up_interruptible(&pipe->wait);
+		wake_up_interruptible_sync(&pipe->wait);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
 	if (ret > 0)
@@ -556,7 +555,7 @@ redo2:
 out:
 	mutex_unlock(&inode->i_mutex);
 	if (do_wakeup) {
-		wake_up_interruptible(&pipe->wait);
+		wake_up_interruptible_sync(&pipe->wait);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
 	if (ret > 0)
@@ -650,7 +649,7 @@ pipe_release(struct inode *inode, int decr, int decw)
 	if (!pipe->readers && !pipe->writers) {
 		free_pipe_info(inode);
 	} else {
-		wake_up_interruptible(&pipe->wait);
+		wake_up_interruptible_sync(&pipe->wait);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ee4814dd98f..27b59f5f3bd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -370,6 +370,11 @@ static cputime_t task_stime(struct task_struct *p)
 }
 #endif
 
+static cputime_t task_gtime(struct task_struct *p)
+{
+	return p->gtime;
+}
+
 static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 {
 	unsigned long vsize, eip, esp, wchan = ~0UL;
@@ -385,6 +390,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 	unsigned long cmin_flt = 0, cmaj_flt = 0;
 	unsigned long min_flt = 0, maj_flt = 0;
 	cputime_t cutime, cstime, utime, stime;
+	cputime_t cgtime, gtime;
 	unsigned long rsslim = 0;
 	char tcomm[sizeof(task->comm)];
 	unsigned long flags;
@@ -403,6 +409,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 	sigemptyset(&sigign);
 	sigemptyset(&sigcatch);
 	cutime = cstime = utime = stime = cputime_zero;
+	cgtime = gtime = cputime_zero;
 
 	rcu_read_lock();
 	if (lock_task_sighand(task, &flags)) {
@@ -420,6 +427,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 			cmaj_flt = sig->cmaj_flt;
 			cutime = sig->cutime;
 			cstime = sig->cstime;
+			cgtime = sig->cgtime;
 			rsslim = sig->rlim[RLIMIT_RSS].rlim_cur;
 
 			/* add up live thread stats at the group level */
@@ -430,6 +438,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 				maj_flt += t->maj_flt;
 				utime = cputime_add(utime, task_utime(t));
 				stime = cputime_add(stime, task_stime(t));
+				gtime = cputime_add(gtime, task_gtime(t));
 				t = next_thread(t);
 			} while (t != task);
 
@@ -437,6 +446,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 			maj_flt += sig->maj_flt;
 			utime = cputime_add(utime, sig->utime);
 			stime = cputime_add(stime, sig->stime);
+			gtime = cputime_add(gtime, sig->gtime);
 		}
 
 		sid = signal_session(sig);
@@ -454,6 +464,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 		maj_flt = task->maj_flt;
 		utime = task_utime(task);
 		stime = task_stime(task);
+		gtime = task_gtime(task);
 	}
 
 	/* scale priority and nice values from timeslices to -20..20 */
@@ -471,7 +482,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 	res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
 		task->pid,
 		tcomm,
 		state,
@@ -516,7 +527,9 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 		task_cpu(task),
 		task->rt_priority,
 		task->policy,
-		(unsigned long long)delayacct_blkio_ticks(task));
+		(unsigned long long)delayacct_blkio_ticks(task),
+		cputime_to_clock_t(gtime),
+		cputime_to_clock_t(cgtime));
 	if (mm)
 		mmput(mm);
 	return res;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 19489b0d555..e5d0953d4db 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -304,7 +304,7 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 	return sprintf(buffer, "%llu %llu %lu\n",
 			task->sched_info.cpu_time,
 			task->sched_info.run_delay,
-			task->sched_info.pcnt);
+			task->sched_info.pcount);
 }
 #endif
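With the two extra %lu/%ld conversions, /proc/<pid>/stat grows two trailing
fields: the task's guest time (field 43) and the cumulative guest time of
its reaped children (field 44), both in clock ticks. A hedged user-space
sketch for reading them; read_guest_time() is a hypothetical helper and the
field positions assume this patch's 44-field layout:

    #include <stdio.h>

    /*
     * Read the new guest-time fields from /proc/<pid>/stat. comm
     * (field 2) is skipped with %*s, which only works while the
     * process name contains no spaces.
     */
    int read_guest_time(int pid, unsigned long *gtime, long *cgtime)
    {
        char path[64];
        FILE *f;
        int i;

        snprintf(path, sizeof(path), "/proc/%d/stat", pid);
        f = fopen(path, "r");
        if (!f)
            return -1;

        fscanf(f, "%*d %*s");          /* pid, comm */
        for (i = 3; i <= 42; i++)      /* state .. blkio ticks */
            fscanf(f, "%*s");
        if (fscanf(f, "%lu %ld", gtime, cgtime) != 2) {
            fclose(f);
            return -1;
        }
        fclose(f);
        return 0;
    }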
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index bee251cb87c..b872a01ad3a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -443,6 +443,7 @@ static int show_stat(struct seq_file *p, void *v)
 	int i;
 	unsigned long jif;
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
+	cputime64_t guest;
 	u64 sum = 0;
 	struct timespec boottime;
 	unsigned int *per_irq_sum;
@@ -453,6 +454,7 @@ static int show_stat(struct seq_file *p, void *v)
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
+	guest = cputime64_zero;
 	getboottime(&boottime);
 	jif = boottime.tv_sec;
 
@@ -467,6 +469,7 @@ static int show_stat(struct seq_file *p, void *v)
 		irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
+		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
 		for (j = 0; j < NR_IRQS; j++) {
 			unsigned int temp = kstat_cpu(i).irqs[j];
 			sum += temp;
@@ -474,7 +477,7 @@ static int show_stat(struct seq_file *p, void *v)
 		}
 	}
 
-	seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+	seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
 		(unsigned long long)cputime64_to_clock_t(user),
 		(unsigned long long)cputime64_to_clock_t(nice),
 		(unsigned long long)cputime64_to_clock_t(system),
@@ -482,7 +485,8 @@ static int show_stat(struct seq_file *p, void *v)
 		(unsigned long long)cputime64_to_clock_t(iowait),
 		(unsigned long long)cputime64_to_clock_t(irq),
 		(unsigned long long)cputime64_to_clock_t(softirq),
-		(unsigned long long)cputime64_to_clock_t(steal));
+		(unsigned long long)cputime64_to_clock_t(steal),
+		(unsigned long long)cputime64_to_clock_t(guest));
 	for_each_online_cpu(i) {
 
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
 		user = kstat_cpu(i).cpustat.user;
@@ -494,7 +498,9 @@ static int show_stat(struct seq_file *p, void *v)
 		irq = kstat_cpu(i).cpustat.irq;
 		softirq = kstat_cpu(i).cpustat.softirq;
 		steal = kstat_cpu(i).cpustat.steal;
-		seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n",
+		guest = kstat_cpu(i).cpustat.guest;
+		seq_printf(p,
+			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
 			i,
 			(unsigned long long)cputime64_to_clock_t(user),
 			(unsigned long long)cputime64_to_clock_t(nice),
@@ -503,7 +509,8 @@ static int show_stat(struct seq_file *p, void *v)
 			(unsigned long long)cputime64_to_clock_t(iowait),
 			(unsigned long long)cputime64_to_clock_t(irq),
 			(unsigned long long)cputime64_to_clock_t(softirq),
-			(unsigned long long)cputime64_to_clock_t(steal));
+			(unsigned long long)cputime64_to_clock_t(steal),
+			(unsigned long long)cputime64_to_clock_t(guest));
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 43e895f1cab..12bf44f083f 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -23,6 +23,7 @@ struct cpu_usage_stat {
 	cputime64_t idle;
 	cputime64_t iowait;
 	cputime64_t steal;
+	cputime64_t guest;
 };
 
 struct kernel_stat {
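Both the aggregate "cpu" line and the per-cpu lines in /proc/stat thus gain
a ninth column for guest time. A sketch of a reader that tolerates both the
old 8-column and new 9-column layouts; read_cpu_line() is a hypothetical
user-space helper, not kernel code:

    #include <stdio.h>

    /*
     * Parse the first line of /proc/stat. Returns the number of columns
     * recognized: 8 on older kernels, 9 once the guest field exists.
     */
    int read_cpu_line(unsigned long long v[9])
    {
        FILE *f = fopen("/proc/stat", "r");
        int n;

        if (!f)
            return -1;
        n = fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
                   &v[0], &v[1], &v[2], &v[3], &v[4],
                   &v[5], &v[6], &v[7], &v[8]);
        fclose(f);
        if (n < 9)
            v[8] = 0;   /* kernel without guest accounting */
        return n;
    }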
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 833f7dc2b8d..228e0a8ce24 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -87,6 +87,7 @@ struct sched_param {
 #include <linux/timer.h>
 #include <linux/hrtimer.h>
 #include <linux/task_io_accounting.h>
+#include <linux/kobject.h>
 
 #include <asm/processor.h>
 
@@ -136,6 +137,7 @@ extern unsigned long weighted_cpuload(const int cpu);
 
 struct seq_file;
 struct cfs_rq;
+struct task_group;
 #ifdef CONFIG_SCHED_DEBUG
 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
 extern void proc_sched_set_task(struct task_struct *p);
@@ -174,8 +176,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #define EXIT_ZOMBIE		16
 #define EXIT_DEAD		32
 /* in tsk->state again */
-#define TASK_NONINTERACTIVE	64
-#define TASK_DEAD		128
+#define TASK_DEAD		64
 
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
@@ -516,6 +517,8 @@ struct signal_struct {
 	 * in __exit_signal, except for the group leader.
 	 */
 	cputime_t utime, stime, cutime, cstime;
+	cputime_t gtime;
+	cputime_t cgtime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
@@ -596,8 +599,21 @@ struct user_struct {
 	/* Hash table maintenance information */
 	struct hlist_node uidhash_node;
 	uid_t uid;
+
+#ifdef CONFIG_FAIR_USER_SCHED
+	struct task_group *tg;
+	struct kset kset;
+	struct subsys_attribute user_attr;
+	struct work_struct work;
+#endif
 };
 
+#ifdef CONFIG_FAIR_USER_SCHED
+extern int uids_kobject_init(void);
+#else
+static inline int uids_kobject_init(void) { return 0; }
+#endif
+
 extern struct user_struct *find_user(uid_t);
 
 extern struct user_struct root_user;
@@ -609,13 +625,17 @@ struct reclaim_state;
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 struct sched_info {
 	/* cumulative counters */
-	unsigned long pcnt;	      /* # of times run on this cpu */
+	unsigned long pcount;	      /* # of times run on this cpu */
 	unsigned long long cpu_time,  /* time spent on the cpu */
 			   run_delay; /* time spent waiting on a runqueue */
 
 	/* timestamps */
 	unsigned long long last_arrival,/* when we last ran on a cpu */
 			   last_queued;	/* when we were last queued to run */
+#ifdef CONFIG_SCHEDSTATS
+	/* BKL stats */
+	unsigned long bkl_count;
+#endif
 };
 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
 
@@ -750,7 +770,7 @@ struct sched_domain {
 
 #ifdef CONFIG_SCHEDSTATS
 	/* load_balance() stats */
-	unsigned long lb_cnt[CPU_MAX_IDLE_TYPES];
+	unsigned long lb_count[CPU_MAX_IDLE_TYPES];
 	unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
 	unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
 	unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
@@ -760,17 +780,17 @@ struct sched_domain {
 	unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
 
 	/* Active load balancing */
-	unsigned long alb_cnt;
+	unsigned long alb_count;
 	unsigned long alb_failed;
 	unsigned long alb_pushed;
 
 	/* SD_BALANCE_EXEC stats */
-	unsigned long sbe_cnt;
+	unsigned long sbe_count;
 	unsigned long sbe_balanced;
 	unsigned long sbe_pushed;
 
 	/* SD_BALANCE_FORK stats */
-	unsigned long sbf_cnt;
+	unsigned long sbf_count;
 	unsigned long sbf_balanced;
 	unsigned long sbf_pushed;
 
@@ -854,11 +874,11 @@ struct rq;
 struct sched_domain;
 
 struct sched_class {
-	struct sched_class *next;
+	const struct sched_class *next;
 
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
-	void (*yield_task) (struct rq *rq, struct task_struct *p);
+	void (*yield_task) (struct rq *rq);
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
 
@@ -888,31 +908,22 @@ struct load_weight {
  *     4 se->block_start
  *     4 se->run_node
  *     4 se->sleep_start
- *     4 se->sleep_start_fair
  *     6 se->load.weight
- *     7 se->delta_fair
- *    15 se->wait_runtime
  */
 struct sched_entity {
-	long			wait_runtime;
-	unsigned long		delta_fair_run;
-	unsigned long		delta_fair_sleep;
-	unsigned long		delta_exec;
-	s64			fair_key;
 	struct load_weight	load;		/* for load-balancing */
 	struct rb_node		run_node;
 	unsigned int		on_rq;
+	int			peer_preempt;
 
 	u64			exec_start;
 	u64			sum_exec_runtime;
+	u64			vruntime;
 	u64			prev_sum_exec_runtime;
-	u64			wait_start_fair;
-	u64			sleep_start_fair;
 
 #ifdef CONFIG_SCHEDSTATS
 	u64			wait_start;
 	u64			wait_max;
-	s64			sum_wait_runtime;
 
 	u64			sleep_start;
 	u64			sleep_max;
@@ -921,9 +932,25 @@ struct sched_entity {
 	u64			block_start;
 	u64			block_max;
 	u64			exec_max;
-
-	unsigned long		wait_runtime_overruns;
-	unsigned long		wait_runtime_underruns;
+	u64			slice_max;
+
+	u64			nr_migrations;
+	u64			nr_migrations_cold;
+	u64			nr_failed_migrations_affine;
+	u64			nr_failed_migrations_running;
+	u64			nr_failed_migrations_hot;
+	u64			nr_forced_migrations;
+	u64			nr_forced2_migrations;
+
+	u64			nr_wakeups;
+	u64			nr_wakeups_sync;
+	u64			nr_wakeups_migrate;
+	u64			nr_wakeups_local;
+	u64			nr_wakeups_remote;
+	u64			nr_wakeups_affine;
+	u64			nr_wakeups_affine_attempts;
+	u64			nr_wakeups_passive;
+	u64			nr_wakeups_idle;
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -952,7 +979,7 @@ struct task_struct {
 	int prio, static_prio, normal_prio;
 	struct list_head run_list;
-	struct sched_class *sched_class;
+	const struct sched_class *sched_class;
 	struct sched_entity se;
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1023,6 +1050,7 @@ struct task_struct {
 
 	unsigned int rt_priority;
 	cputime_t utime, stime;
+	cputime_t gtime;
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	struct timespec start_time;		/* monotonic time */
 	struct timespec real_start_time;	/* boot based time */
@@ -1314,6 +1342,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_STARTING	0x00000002	/* being created */
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
+#define PF_VCPU		0x00000010	/* I'm a virtual CPU */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
 #define PF_DUMPCORE	0x00000200	/* dumped core */
@@ -1401,15 +1430,17 @@ static inline void idle_task_exit(void) {}
 
 extern void sched_idle_next(void);
 
+#ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_nr_latency;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_batch_wakeup_granularity;
-extern unsigned int sysctl_sched_stat_granularity;
-extern unsigned int sysctl_sched_runtime_limit;
-extern unsigned int sysctl_sched_compat_yield;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
+extern unsigned int sysctl_sched_migration_cost;
+#endif
+
+extern unsigned int sysctl_sched_compat_yield;
 
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
@@ -1843,6 +1874,18 @@ extern int sched_mc_power_savings, sched_smt_power_savings;
 
 extern void normalize_rt_tasks(void);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+extern struct task_group init_task_group;
+
+extern struct task_group *sched_create_group(void);
+extern void sched_destroy_group(struct task_group *tg);
+extern void sched_move_task(struct task_struct *tsk);
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern unsigned long sched_group_shares(struct task_group *tg);
+
+#endif
+
 #ifdef CONFIG_TASK_XACCT
 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
 {
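The CONFIG_FAIR_GROUP_SCHED block above is the whole in-kernel API surface
for task groups. A hedged sketch of how another kernel subsystem might
drive it; demo_make_group() is a hypothetical caller, and the exact error
semantics of sched_create_group() are an assumption here:

    #include <linux/sched.h>
    #include <linux/err.h>

    /*
     * Sketch: create a group, give it twice the default 1024-share
     * weight, and re-attach a task's scheduling entities. The group a
     * task belongs to is derived from its uid (or cgroup), so a real
     * caller would change that association before sched_move_task();
     * shown here only for shape.
     */
    static int demo_make_group(struct task_struct *p)
    {
        struct task_group *tg;
        int ret;

        tg = sched_create_group();
        if (IS_ERR(tg))
            return PTR_ERR(tg);

        ret = sched_group_set_shares(tg, 2048);
        if (ret)
            return ret;

        sched_move_task(p);
        return sched_group_shares(tg) == 2048 ? 0 : -EINVAL;
    }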
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 525d437b125..47729f18bfd 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -159,15 +159,14 @@
 	.imbalance_pct		= 125,			\
 	.cache_nice_tries	= 1,			\
 	.busy_idx		= 2,			\
-	.idle_idx		= 0,			\
-	.newidle_idx		= 0,			\
+	.idle_idx		= 1,			\
+	.newidle_idx		= 2,			\
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
-				| SD_WAKE_IDLE		\
 				| BALANCE_FOR_PKG_POWER,\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/init/Kconfig b/init/Kconfig
index d54d0cadcc0..54f31a191b8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -281,6 +281,27 @@ config CPUSETS
 
 	  Say N if unsure.
 
+config FAIR_GROUP_SCHED
+	bool "Fair group CPU scheduler"
+	default y
+	depends on EXPERIMENTAL
+	help
+	  This feature lets the CPU scheduler recognize task groups and
+	  control CPU bandwidth allocation to such task groups.
+
+choice
+	depends on FAIR_GROUP_SCHED
+	prompt "Basis for grouping tasks"
+	default FAIR_USER_SCHED
+
+config FAIR_USER_SCHED
+	bool "user id"
+	help
+	  This option will choose userid as the basis for grouping
+	  tasks, thus providing equal CPU bandwidth to each user.
+
+endchoice
+
 config SYSFS_DEPRECATED
 	bool "Create deprecated sysfs files"
 	default y
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 81e69782963..09e9574eeb2 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	 * No locking available for sched_info (and too expensive to add one)
 	 * Mitigate by taking snapshot of values
 	 */
-	t1 = tsk->sched_info.pcnt;
+	t1 = tsk->sched_info.pcount;
 	t2 = tsk->sched_info.run_delay;
 	t3 = tsk->sched_info.cpu_time;
diff --git a/kernel/exit.c b/kernel/exit.c
index 993369ee94d..7f7959de4a8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -111,6 +111,7 @@ static void __exit_signal(struct task_struct *tsk)
 		 */
 		sig->utime = cputime_add(sig->utime, tsk->utime);
 		sig->stime = cputime_add(sig->stime, tsk->stime);
+		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
 		sig->nvcsw += tsk->nvcsw;
@@ -1242,6 +1243,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 			cputime_add(p->stime,
 			cputime_add(sig->stime,
 				    sig->cstime)));
+		psig->cgtime =
+			cputime_add(psig->cgtime,
+			cputime_add(p->gtime,
+			cputime_add(sig->gtime,
+				    sig->cgtime)));
 		psig->cmin_flt +=
 			p->min_flt + sig->min_flt + sig->cmin_flt;
 		psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index 5e67f90a169..3fc3c138391 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -877,6 +877,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	sig->tty_old_pgrp = NULL;
 
 	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+	sig->gtime = cputime_zero;
+	sig->cgtime = cputime_zero;
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
@@ -1045,6 +1047,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->utime = cputime_zero;
 	p->stime = cputime_zero;
+	p->gtime = cputime_zero;
 
 #ifdef CONFIG_TASK_XACCT
 	p->rchar = 0;		/* I/O counter: bytes read */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d0e5c48e18c..6046939d080 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kexec.h>
+#include <linux/sched.h>
 
 #define KERNEL_ATTR_RO(_name) \
 static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -116,6 +117,13 @@ static int __init ksysfs_init(void)
 					    &notes_attr);
 	}
 
+	/*
+	 * Create "/sys/kernel/uids" directory and corresponding root user's
+	 * directory under it.
+	 */
+	if (!error)
+		error = uids_kobject_init();
+
 	return error;
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 6c10fa796ca..bba57adb950 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -96,7 +96,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 /*
  * Some helpers for converting nanosecond timing to jiffy resolution
  */
-#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
+#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (1000000000 / HZ))
 #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
 
 #define NICE_0_LOAD		SCHED_LOAD_SCALE
 
@@ -105,11 +105,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 /*
  * These are the 'tuning knobs' of the scheduler:
  *
- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
- * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
+ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
  * Timeslices get refilled after they expire.
  */
-#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
 #define DEF_TIMESLICE		(100 * HZ / 1000)
 
 #ifdef CONFIG_SMP
@@ -133,24 +131,6 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
 }
 #endif
 
-#define SCALE_PRIO(x, prio) \
-	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
-
-/*
- * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
- * to time slice values: [800ms ... 100ms ... 5ms]
- */
-static unsigned int static_prio_timeslice(int static_prio)
-{
-	if (static_prio == NICE_TO_PRIO(19))
-		return 1;
-
-	if (static_prio < NICE_TO_PRIO(0))
-		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
-	else
-		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
-}
-
 static inline int rt_policy(int policy)
 {
 	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
@@ -171,31 +151,91 @@ struct rt_prio_array {
 	struct list_head queue[MAX_RT_PRIO];
 };
 
-struct load_stat {
-	struct load_weight load;
-	u64 load_update_start, load_update_last;
-	unsigned long delta_fair, delta_exec, delta_stat;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+struct cfs_rq;
+
+/* task group related information */
+struct task_group {
+	/* schedulable entities of this group on each cpu */
+	struct sched_entity **se;
+	/* runqueue "owned" by this group on each cpu */
+	struct cfs_rq **cfs_rq;
+	unsigned long shares;
+	/* spinlock to serialize modification to shares */
+	spinlock_t lock;
+};
+
+/* Default task group's sched entity on each cpu */
+static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
+/* Default task group's cfs_rq on each cpu */
+static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+
+static struct sched_entity *init_sched_entity_p[NR_CPUS];
+static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
+
+/* Default task group.
+ * Every task in the system belongs to this group at bootup.
+ */
+struct task_group init_task_group = {
+	.se	= init_sched_entity_p,
+	.cfs_rq	= init_cfs_rq_p,
 };
 
+#ifdef CONFIG_FAIR_USER_SCHED
+# define INIT_TASK_GRP_LOAD	2*NICE_0_LOAD
+#else
+# define INIT_TASK_GRP_LOAD	NICE_0_LOAD
+#endif
+
+static int init_task_group_load = INIT_TASK_GRP_LOAD;
+
+/* return group to which a task belongs */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+	struct task_group *tg;
+
+#ifdef CONFIG_FAIR_USER_SCHED
+	tg = p->user->tg;
+#else
+	tg = &init_task_group;
+#endif
+
+	return tg;
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+	p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)];
+	p->se.parent = task_group(p)->se[task_cpu(p)];
+}
+
+#else
+
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+
+#endif	/* CONFIG_FAIR_GROUP_SCHED */
+
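The struct cfs_rq hunk that follows replaces the fair_clock/wait_runtime
bookkeeping with a single min_vruntime, reflecting the new virtual-runtime
model: each entity's vruntime advances at a rate inversely proportional to
its load weight, and the leftmost (smallest-vruntime) entity in the rbtree
runs next. A simplified sketch of the advance rule; the names are
illustrative and the real update lives in kernel/sched_fair.c, which this
excerpt is truncated before reaching:

    /*
     * Sketch: a nice-0 task (weight == NICE_0_LOAD, i.e. 1024 per the
     * #define above) accrues vruntime at wall speed; heavier tasks
     * accrue it more slowly and therefore stay leftmost longer. The
     * kernel avoids this 64-bit division by using precomputed inverse
     * weights and shifts.
     */
    static u64 advance_vruntime(u64 vruntime, u64 delta_exec,
                                unsigned long weight)
    {
        /* delta_vruntime = delta_exec * (NICE_0_LOAD / weight) */
        return vruntime + delta_exec * NICE_0_LOAD / weight;
    }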
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight load;
 	unsigned long nr_running;
 
-	s64 fair_clock;
 	u64 exec_clock;
-	s64 wait_runtime;
-	u64 sleeper_bonus;
-	unsigned long wait_runtime_overruns, wait_runtime_underruns;
+	u64 min_vruntime;
 
 	struct rb_root tasks_