Diffstat:
-rw-r--r--  Documentation/sched-design-CFS.txt |   67
-rw-r--r--  arch/i386/Kconfig                  |   11
-rw-r--r--  drivers/kvm/kvm.h                  |   10
-rw-r--r--  drivers/kvm/kvm_main.c             |    2
-rw-r--r--  fs/pipe.c                          |    9
-rw-r--r--  fs/proc/array.c                    |   17
-rw-r--r--  fs/proc/base.c                     |    2
-rw-r--r--  fs/proc/proc_misc.c                |   15
-rw-r--r--  include/linux/kernel_stat.h        |    1
-rw-r--r--  include/linux/sched.h              |   99
-rw-r--r--  include/linux/topology.h           |    5
-rw-r--r--  init/Kconfig                       |   21
-rw-r--r--  kernel/delayacct.c                 |    2
-rw-r--r--  kernel/exit.c                      |    6
-rw-r--r--  kernel/fork.c                      |    3
-rw-r--r--  kernel/ksysfs.c                    |    8
-rw-r--r--  kernel/sched.c                     | 1444
-rw-r--r--  kernel/sched_debug.c               |  282
-rw-r--r--  kernel/sched_fair.c                |  811
-rw-r--r--  kernel/sched_idletask.c            |    8
-rw-r--r--  kernel/sched_rt.c                  |   19
-rw-r--r--  kernel/sched_stats.h               |   28
-rw-r--r--  kernel/sysctl.c                    |   37
-rw-r--r--  kernel/user.c                      |  249
-rw-r--r--  net/unix/af_unix.c                 |    4
25 files changed, 1872 insertions, 1288 deletions
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt
index 84901e7c050..88bcb876733 100644
--- a/Documentation/sched-design-CFS.txt
+++ b/Documentation/sched-design-CFS.txt
@@ -117,3 +117,70 @@ Some implementation details:
iterators of the scheduling modules are used. The balancing code got
quite a bit simpler as a result.
+
+Group scheduler extension to CFS
+================================
+
+Normally the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task. Sometimes, it may be desirable to group tasks
+and provide fair CPU time to each such task group. For example, it may
+be desirable to first provide fair CPU time to each user on the system
+and then to each task belonging to a user.
+
+CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
+SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such
+groups. At present, there are two (mutually exclusive) mechanisms to group
+tasks for CPU bandwidth control purposes:
+
+ - Based on user id (CONFIG_FAIR_USER_SCHED)
+ In this option, tasks are grouped according to their user id.
+ - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
+	  This option lets the administrator create arbitrary groups
+ of tasks, using the "cgroup" pseudo filesystem. See
+ Documentation/cgroups.txt for more information about this
+ filesystem.
+
+Only one of these options to group tasks can be chosen, not both.
+
+Group scheduler tunables:
+
+When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
+each new user and a "cpu_share" file is added in that directory.
+
+ # cd /sys/kernel/uids
+ # cat 512/cpu_share # Display user 512's CPU share
+ 1024
+ # echo 2048 > 512/cpu_share # Modify user 512's CPU share
+ # cat 512/cpu_share # Display user 512's CPU share
+ 2048
+ #
+
+CPU bandwidth between two users is divided in the ratio of their CPU shares.
+For example: if you would like user "root" to get twice the bandwidth of user
+"guest", then set the cpu_share for both users such that "root"'s
+cpu_share is twice "guest"'s cpu_share.
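+
+For illustration (root is always uid 0; the uid shown for "guest" is an
+assumption, use the actual uid of "guest" on your system):
+
+	# echo 2048 > /sys/kernel/uids/0/cpu_share	# "root"
+	# echo 1024 > /sys/kernel/uids/1000/cpu_share	# "guest" (assumed uid 1000)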
+
+
+When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
+for each group created using the pseudo filesystem. See the example steps
+below for creating task groups and modifying their CPU share using the
+"cgroup" pseudo filesystem.
+
+ # mkdir /dev/cpuctl
+ # mount -t cgroup -ocpu none /dev/cpuctl
+ # cd /dev/cpuctl
+
+ # mkdir multimedia # create "multimedia" group of tasks
+ # mkdir browser # create "browser" group of tasks
+
+	# #Configure the multimedia group to receive twice the CPU bandwidth
+	# #of the browser group
+
+ # echo 2048 > multimedia/cpu.shares
+ # echo 1024 > browser/cpu.shares
+
+ # firefox & # Launch firefox and move it to "browser" group
+ # echo <firefox_pid> > browser/tasks
+
+ # #Launch gmplayer (or your favourite movie player)
+ # echo <movie_player_pid> > multimedia/tasks
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index f1486f8a3e6..bf9aafad497 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -214,6 +214,17 @@ config X86_ES7000
endchoice
+config SCHED_NO_NO_OMIT_FRAME_POINTER
+ bool "Single-depth WCHAN output"
+ default y
+ help
+ Calculate simpler /proc/<PID>/wchan values. If this option
+ is disabled then wchan values will recurse back to the
+ caller function. This provides more accurate wchan values,
+ at the expense of slightly more scheduling overhead.
+
+ If in doubt, say "Y".
+
config PARAVIRT
bool "Paravirtualization support (EXPERIMENTAL)"
depends on EXPERIMENTAL
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index ad0813843ad..3b0bc4bda5f 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -624,6 +624,16 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu);
int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
+static inline void kvm_guest_enter(void)
+{
+ current->flags |= PF_VCPU;
+}
+
+static inline void kvm_guest_exit(void)
+{
+ current->flags &= ~PF_VCPU;
+}
+
static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 353e58527d1..af2d288c881 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -2046,6 +2046,7 @@ again:
kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
vcpu->guest_mode = 1;
+ kvm_guest_enter();
if (vcpu->requests)
if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
@@ -2053,6 +2054,7 @@ again:
kvm_x86_ops->run(vcpu, kvm_run);
+ kvm_guest_exit();
vcpu->guest_mode = 0;
local_irq_enable();
diff --git a/fs/pipe.c b/fs/pipe.c
index 6b3d91a691b..e66ec48e95d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -45,8 +45,7 @@ void pipe_wait(struct pipe_inode_info *pipe)
* Pipes are system-local resources, so sleeping on them
* is considered a noninteractive wait:
*/
- prepare_to_wait(&pipe->wait, &wait,
- TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
+ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
if (pipe->inode)
mutex_unlock(&pipe->inode->i_mutex);
schedule();
@@ -383,7 +382,7 @@ redo:
/* Signal writers asynchronously that there is more room. */
if (do_wakeup) {
- wake_up_interruptible(&pipe->wait);
+ wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
if (ret > 0)
@@ -556,7 +555,7 @@ redo2:
out:
mutex_unlock(&inode->i_mutex);
if (do_wakeup) {
- wake_up_interruptible(&pipe->wait);
+ wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
if (ret > 0)
@@ -650,7 +649,7 @@ pipe_release(struct inode *inode, int decr, int decw)
if (!pipe->readers && !pipe->writers) {
free_pipe_info(inode);
} else {
- wake_up_interruptible(&pipe->wait);
+ wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ee4814dd98f..27b59f5f3bd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -370,6 +370,11 @@ static cputime_t task_stime(struct task_struct *p)
}
#endif
+static cputime_t task_gtime(struct task_struct *p)
+{
+ return p->gtime;
+}
+
static int do_task_stat(struct task_struct *task, char *buffer, int whole)
{
unsigned long vsize, eip, esp, wchan = ~0UL;
@@ -385,6 +390,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
unsigned long cmin_flt = 0, cmaj_flt = 0;
unsigned long min_flt = 0, maj_flt = 0;
cputime_t cutime, cstime, utime, stime;
+ cputime_t cgtime, gtime;
unsigned long rsslim = 0;
char tcomm[sizeof(task->comm)];
unsigned long flags;
@@ -403,6 +409,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
sigemptyset(&sigign);
sigemptyset(&sigcatch);
cutime = cstime = utime = stime = cputime_zero;
+ cgtime = gtime = cputime_zero;
rcu_read_lock();
if (lock_task_sighand(task, &flags)) {
@@ -420,6 +427,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
cmaj_flt = sig->cmaj_flt;
cutime = sig->cutime;
cstime = sig->cstime;
+ cgtime = sig->cgtime;
rsslim = sig->rlim[RLIMIT_RSS].rlim_cur;
/* add up live thread stats at the group level */
@@ -430,6 +438,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
maj_flt += t->maj_flt;
utime = cputime_add(utime, task_utime(t));
stime = cputime_add(stime, task_stime(t));
+ gtime = cputime_add(gtime, task_gtime(t));
t = next_thread(t);
} while (t != task);
@@ -437,6 +446,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
maj_flt += sig->maj_flt;
utime = cputime_add(utime, sig->utime);
stime = cputime_add(stime, sig->stime);
+			gtime = cputime_add(gtime, sig->gtime);
}
sid = signal_session(sig);
@@ -454,6 +464,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
maj_flt = task->maj_flt;
utime = task_utime(task);
stime = task_stime(task);
+ gtime = task_gtime(task);
}
/* scale priority and nice values from timeslices to -20..20 */
@@ -471,7 +482,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
task->pid,
tcomm,
state,
@@ -516,7 +527,9 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
task_cpu(task),
task->rt_priority,
task->policy,
- (unsigned long long)delayacct_blkio_ticks(task));
+ (unsigned long long)delayacct_blkio_ticks(task),
+ cputime_to_clock_t(gtime),
+ cputime_to_clock_t(cgtime));
if (mm)
mmput(mm);
return res;
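With this change the per-task stat line gains two trailing fields: guest time
and children's guest time, i.e. fields 43 and 44 when counting from 1, both in
USER_HZ clock ticks. An illustrative way to look at them (note that a comm
containing spaces would shift naive field counting):

	$ awk '{print "guest:", $43, "cguest:", $44}' /proc/self/stat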
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 19489b0d555..e5d0953d4db 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -304,7 +304,7 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
return sprintf(buffer, "%llu %llu %lu\n",
task->sched_info.cpu_time,
task->sched_info.run_delay,
- task->sched_info.pcnt);
+ task->sched_info.pcount);
}
#endif
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index bee251cb87c..b872a01ad3a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -443,6 +443,7 @@ static int show_stat(struct seq_file *p, void *v)
int i;
unsigned long jif;
cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
+ cputime64_t guest;
u64 sum = 0;
struct timespec boottime;
unsigned int *per_irq_sum;
@@ -453,6 +454,7 @@ static int show_stat(struct seq_file *p, void *v)
user = nice = system = idle = iowait =
irq = softirq = steal = cputime64_zero;
+ guest = cputime64_zero;
getboottime(&boottime);
jif = boottime.tv_sec;
@@ -467,6 +469,7 @@ static int show_stat(struct seq_file *p, void *v)
irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
+ guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
for (j = 0; j < NR_IRQS; j++) {
unsigned int temp = kstat_cpu(i).irqs[j];
sum += temp;
@@ -474,7 +477,7 @@ static int show_stat(struct seq_file *p, void *v)
}
}
- seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
(unsigned long long)cputime64_to_clock_t(user),
(unsigned long long)cputime64_to_clock_t(nice),
(unsigned long long)cputime64_to_clock_t(system),
@@ -482,7 +485,8 @@ static int show_stat(struct seq_file *p, void *v)
(unsigned long long)cputime64_to_clock_t(iowait),
(unsigned long long)cputime64_to_clock_t(irq),
(unsigned long long)cputime64_to_clock_t(softirq),
- (unsigned long long)cputime64_to_clock_t(steal));
+ (unsigned long long)cputime64_to_clock_t(steal),
+ (unsigned long long)cputime64_to_clock_t(guest));
for_each_online_cpu(i) {
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -494,7 +498,9 @@ static int show_stat(struct seq_file *p, void *v)
irq = kstat_cpu(i).cpustat.irq;
softirq = kstat_cpu(i).cpustat.softirq;
steal = kstat_cpu(i).cpustat.steal;
- seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ guest = kstat_cpu(i).cpustat.guest;
+ seq_printf(p,
+ "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
i,
(unsigned long long)cputime64_to_clock_t(user),
(unsigned long long)cputime64_to_clock_t(nice),
@@ -503,7 +509,8 @@ static int show_stat(struct seq_file *p, void *v)
(unsigned long long)cputime64_to_clock_t(iowait),
(unsigned long long)cputime64_to_clock_t(irq),
(unsigned long long)cputime64_to_clock_t(softirq),
- (unsigned long long)cputime64_to_clock_t(steal));
+ (unsigned long long)cputime64_to_clock_t(steal),
+ (unsigned long long)cputime64_to_clock_t(guest));
}
seq_printf(p, "intr %llu", (unsigned long long)sum);
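The system-wide "cpu" lines in /proc/stat likewise gain a ninth value, guest,
printed after steal. An illustrative way to read the aggregate guest time (in
USER_HZ ticks; $10 because the leading "cpu" token is awk field 1):

	$ awk '/^cpu /{print $10}' /proc/stat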
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 43e895f1cab..12bf44f083f 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -23,6 +23,7 @@ struct cpu_usage_stat {
cputime64_t idle;
cputime64_t iowait;
cputime64_t steal;
+ cputime64_t guest;
};
struct kernel_stat {
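The new "guest" counter above is filled in by the tick accounting in
kernel/sched.c, which is part of this patch but not shown in this excerpt. A
minimal sketch of the idea (names and exact placement are assumptions, not the
literal patch code): when the interrupted task has PF_VCPU set, the tick is
credited to guest time as well as user time.

	static void account_guest_time(struct task_struct *p, cputime_t cputime)
	{
		cputime64_t tmp = cputime_to_cputime64(cputime);
		struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

		/* Guest time is also reported as user time. */
		p->utime = cputime_add(p->utime, cputime);
		p->gtime = cputime_add(p->gtime, cputime);

		cpustat->user = cputime64_add(cpustat->user, tmp);
		cpustat->guest = cputime64_add(cpustat->guest, tmp);
	}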
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 833f7dc2b8d..228e0a8ce24 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -87,6 +87,7 @@ struct sched_param {
#include <linux/timer.h>
#include <linux/hrtimer.h>
#include <linux/task_io_accounting.h>
+#include <linux/kobject.h>
#include <asm/processor.h>
@@ -136,6 +137,7 @@ extern unsigned long weighted_cpuload(const int cpu);
struct seq_file;
struct cfs_rq;
+struct task_group;
#ifdef CONFIG_SCHED_DEBUG
extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
extern void proc_sched_set_task(struct task_struct *p);
@@ -174,8 +176,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
#define EXIT_ZOMBIE 16
#define EXIT_DEAD 32
/* in tsk->state again */
-#define TASK_NONINTERACTIVE 64
-#define TASK_DEAD 128
+#define TASK_DEAD 64
#define __set_task_state(tsk, state_value) \
do { (tsk)->state = (state_value); } while (0)
@@ -516,6 +517,8 @@ struct signal_struct {
* in __exit_signal, except for the group leader.
*/
cputime_t utime, stime, cutime, cstime;
+ cputime_t gtime;
+ cputime_t cgtime;
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
unsigned long inblock, oublock, cinblock, coublock;
@@ -596,8 +599,21 @@ struct user_struct {
/* Hash table maintenance information */
struct hlist_node uidhash_node;
uid_t uid;
+
+#ifdef CONFIG_FAIR_USER_SCHED
+ struct task_group *tg;
+ struct kset kset;
+ struct subsys_attribute user_attr;
+ struct work_struct work;
+#endif
};
+#ifdef CONFIG_FAIR_USER_SCHED
+extern int uids_kobject_init(void);
+#else
+static inline int uids_kobject_init(void) { return 0; }
+#endif
+
extern struct user_struct *find_user(uid_t);
extern struct user_struct root_user;
@@ -609,13 +625,17 @@ struct reclaim_state;
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info {
/* cumulative counters */
- unsigned long pcnt; /* # of times run on this cpu */
+ unsigned long pcount; /* # of times run on this cpu */
unsigned long long cpu_time, /* time spent on the cpu */
run_delay; /* time spent waiting on a runqueue */
/* timestamps */
unsigned long long last_arrival,/* when we last ran on a cpu */
last_queued; /* when we were last queued to run */
+#ifdef CONFIG_SCHEDSTATS
+ /* BKL stats */
+ unsigned long bkl_count;
+#endif
};
#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
@@ -750,7 +770,7 @@ struct sched_domain {
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
- unsigned long lb_cnt[CPU_MAX_IDLE_TYPES];
+ unsigned long lb_count[CPU_MAX_IDLE_TYPES];
unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
@@ -760,17 +780,17 @@ struct sched_domain {
unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
/* Active load balancing */
- unsigned long alb_cnt;
+ unsigned long alb_count;
unsigned long alb_failed;
unsigned long alb_pushed;
/* SD_BALANCE_EXEC stats */
- unsigned long sbe_cnt;
+ unsigned long sbe_count;
unsigned long sbe_balanced;
unsigned long sbe_pushed;
/* SD_BALANCE_FORK stats */
- unsigned long sbf_cnt;
+ unsigned long sbf_count;
unsigned long sbf_balanced;
unsigned long sbf_pushed;
@@ -854,11 +874,11 @@ struct rq;
struct sched_domain;
struct sched_class {
- struct sched_class *next;
+ const struct sched_class *next;
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
- void (*yield_task) (struct rq *rq, struct task_struct *p);
+ void (*yield_task) (struct rq *rq);
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
@@ -888,31 +908,22 @@ struct load_weight {
* 4 se->block_start
* 4 se->run_node
* 4 se->sleep_start
- * 4 se->sleep_start_fair
* 6 se->load.weight
- * 7 se->delta_fair
- * 15 se->wait_runtime
*/
struct sched_entity {
- long wait_runtime;
- unsigned long delta_fair_run;
- unsigned long delta_fair_sleep;
- unsigned long delta_exec;
- s64 fair_key;
struct load_weight load; /* for load-balancing */
struct rb_node run_node;
unsigned int on_rq;
+ int peer_preempt;
u64 exec_start;
u64 sum_exec_runtime;
+ u64 vruntime;
u64 prev_sum_exec_runtime;
- u64 wait_start_fair;
- u64 sleep_start_fair;
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 wait_max;
- s64 sum_wait_runtime;
u64 sleep_start;
u64 sleep_max;
@@ -921,9 +932,25 @@ struct sched_entity {
u64 block_start;
u64 block_max;
u64 exec_max;
-
- unsigned long wait_runtime_overruns;
- unsigned long wait_runtime_underruns;
+ u64 slice_max;
+
+ u64 nr_migrations;
+ u64 nr_migrations_cold;
+ u64 nr_failed_migrations_affine;
+ u64 nr_failed_migrations_running;
+ u64 nr_failed_migrations_hot;
+ u64 nr_forced_migrations;
+ u64 nr_forced2_migrations;
+
+ u64 nr_wakeups;
+ u64 nr_wakeups_sync;
+ u64 nr_wakeups_migrate;
+ u64 nr_wakeups_local;
+ u64 nr_wakeups_remote;
+ u64 nr_wakeups_affine;
+ u64 nr_wakeups_affine_attempts;
+ u64 nr_wakeups_passive;
+ u64 nr_wakeups_idle;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -952,7 +979,7 @@ struct task_struct {
int prio, static_prio, normal_prio;
struct list_head run_list;
- struct sched_class *sched_class;
+ const struct sched_class *sched_class;
struct sched_entity se;
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1023,6 +1050,7 @@ struct task_struct {
unsigned int rt_priority;
cputime_t utime, stime;
+ cputime_t gtime;
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
struct timespec real_start_time; /* boot based time */
@@ -1314,6 +1342,7 @@ static inline void put_task_struct(struct task_struct *t)
#define PF_STARTING 0x00000002 /* being created */
#define PF_EXITING 0x00000004 /* getting shut down */
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
+#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* dumped core */
@@ -1401,15 +1430,17 @@ static inline void idle_task_exit(void) {}
extern void sched_idle_next(void);
+#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_nr_latency;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_batch_wakeup_granularity;
-extern unsigned int sysctl_sched_stat_granularity;
-extern unsigned int sysctl_sched_runtime_limit;
-extern unsigned int sysctl_sched_compat_yield;
extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_features;
+extern unsigned int sysctl_sched_migration_cost;
+#endif
+
+extern unsigned int sysctl_sched_compat_yield;
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
@@ -1843,6 +1874,18 @@ extern int sched_mc_power_savings, sched_smt_power_savings;
extern void normalize_rt_tasks(void);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+extern struct task_group init_task_group;
+
+extern struct task_group *sched_create_group(void);
+extern void sched_destroy_group(struct task_group *tg);
+extern void sched_move_task(struct task_struct *tsk);
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern unsigned long sched_group_shares(struct task_group *tg);
+
+#endif
+
#ifdef CONFIG_TASK_XACCT
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
{
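The task_group API declared above (sched_create_group(), sched_move_task(),
sched_group_set_shares()) is what the per-uid grouping in kernel/user.c builds
on. A hedged sketch of one plausible caller; the function name, the default
share of 1024 and the ERR_PTR() error convention are assumptions, not the
literal patch code:

	/* Give a newly allocated user_struct its own scheduling group. */
	static int sched_group_for_uid(struct user_struct *up)
	{
		struct task_group *tg;

		tg = sched_create_group();	/* one group per uid */
		if (IS_ERR(tg))
			return PTR_ERR(tg);

		sched_group_set_shares(tg, 1024);	/* default cpu_share */
		up->tg = tg;
		return 0;
	}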
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 525d437b125..47729f18bfd 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -159,15 +159,14 @@
.imbalance_pct = 125, \
.cache_nice_tries = 1, \
.busy_idx = 2, \
- .idle_idx = 0, \
- .newidle_idx = 0, \
+ .idle_idx = 1, \
+ .newidle_idx = 2, \
.wake_idx = 1, \
.forkexec_idx = 1, \
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \
| SD_WAKE_AFFINE \
- | SD_WAKE_IDLE \
| BALANCE_FOR_PKG_POWER,\
.last_balance = jiffies, \
.balance_interval = 1, \
diff --git a/init/Kconfig b/init/Kconfig
index d54d0cadcc0..54f31a191b8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -281,6 +281,27 @@ config CPUSETS
Say N if unsure.
+config FAIR_GROUP_SCHED
+ bool "Fair group CPU scheduler"
+ default y
+ depends on EXPERIMENTAL
+ help
+	  This feature lets the CPU scheduler recognize task groups and control CPU
+ bandwidth allocation to such task groups.
+
+choice
+ depends on FAIR_GROUP_SCHED
+ prompt "Basis for grouping tasks"
+ default FAIR_USER_SCHED
+
+config FAIR_USER_SCHED
+ bool "user id"
+ help
+ This option will choose userid as the basis for grouping
+ tasks, thus providing equal CPU bandwidth to each user.
+
+endchoice
+
config SYSFS_DEPRECATED
bool "Create deprecated sysfs files"
default y
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 81e69782963..09e9574eeb2 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
* No locking available for sched_info (and too expensive to add one)
* Mitigate by taking snapshot of values
*/
- t1 = tsk->sched_info.pcnt;
+ t1 = tsk->sched_info.pcount;
t2 = tsk->sched_info.run_delay;
t3 = tsk->sched_info.cpu_time;
diff --git a/kernel/exit.c b/kernel/exit.c
index 993369ee94d..7f7959de4a8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -111,6 +111,7 @@ static void __exit_signal(struct task_struct *tsk)
*/
sig->utime = cputime_add(sig->utime, tsk->utime);
sig->stime = cputime_add(sig->stime, tsk->stime);
+ sig->gtime = cputime_add(sig->gtime, tsk->gtime);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
@@ -1242,6 +1243,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
cputime_add(p->stime,
cputime_add(sig->stime,
sig->cstime)));
+ psig->cgtime =
+ cputime_add(psig->cgtime,
+ cputime_add(p->gtime,
+ cputime_add(sig->gtime,
+ sig->cgtime)));
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index 5e67f90a169..3fc3c138391 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -877,6 +877,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
sig->tty_old_pgrp = NULL;
sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+ sig->gtime = cputime_zero;
+ sig->cgtime = cputime_zero;
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
@@ -1045,6 +1047,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = cputime_zero;
p->stime = cputime_zero;
+ p->gtime = cputime_zero;
#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d0e5c48e18c..6046939d080 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kexec.h>
+#include <linux/sched.h>
#define KERNEL_ATTR_RO(_name) \
static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -116,6 +117,13 @@ static int __init ksysfs_init(void)
&notes_attr);
}
+ /*
+ * Create "/sys/kernel/uids" directory and corresponding root user's
+ * directory under it.
+ */
+ if (!error)
+ error = uids_kobject_init();
+
return error;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 6c10fa796ca..bba57adb950 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -96,7 +96,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
/*
* Some helpers for converting nanosecond timing to jiffy resolution
*/
-#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
+#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
#define NICE_0_LOAD SCHED_LOAD_SCALE
@@ -105,11 +105,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
/*
* These are the 'tuning knobs' of the scheduler:
*
- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
- * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
+ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
* Timeslices get refilled after they expire.
*/
-#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE (100 * HZ / 1000)
#ifdef CONFIG_SMP
@@ -133,24 +131,6 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
}
#endif
-#define SCALE_PRIO(x, prio) \
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
-
-/*
- * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
- * to time slice values: [800ms ... 100ms ... 5ms]
- */
-static unsigned int static_prio_timeslice(int static_prio)
-{
- if (static_prio == NICE_TO_PRIO(19))
- return 1;
-
- if (static_prio < NICE_TO_PRIO(0))
- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
- else
- return SCALE_PRIO(DEF_TIMESLICE, static_prio);
-}
-
static inline int rt_policy(int policy)
{
if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
@@ -171,31 +151,91 @@ struct rt_prio_array {
struct list_head queue[MAX_RT_PRIO];
};
-struct load_stat {
- struct load_weight load;
- u64 load_update_start, load_update_last;
- unsigned long delta_fair, delta_exec, delta_stat;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+struct cfs_rq;
+
+/* task group related information */
+struct task_group {
+ /* schedulable entities of this group on each cpu */
+ struct sched_entity **se;
+ /* runqueue "owned" by this group on each cpu */
+ struct cfs_rq **cfs_rq;
+ unsigned long shares;
+ /* spinlock to serialize modification to shares */
+ spinlock_t lock;
+};
+
+/* Default task group's sched entity on each cpu */
+static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
+/* Default task group's cfs_rq on each cpu */
+static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+
+static struct sched_entity *init_sched_entity_p[NR_CPUS];
+static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
+
+/* Default task group.
+ * Every task in the system belongs to this group at bootup.
+ */
+struct task_group init_task_group = {
+ .se = init_sched_entity_p,
+ .cfs_rq = init_cfs_rq_p,
};
+#ifdef CONFIG_FAIR_USER_SCHED
+# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD
+#else
+# define INIT_TASK_GRP_LOAD NICE_0_LOAD
+#endif
+
+static int init_task_group_load = INIT_TASK_GRP_LOAD;
+
+/* return group to which a task belongs */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ struct task_group *tg;
+
+#ifdef CONFIG_FAIR_USER_SCHED
+ tg = p->user->tg;
+#else
+ tg = &init_task_group;
+#endif
+
+ return tg;
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+ p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)];
+ p->se.parent = task_group(p)->se[task_cpu(p)];
+}
+
+#else
+
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
unsigned long nr_running;
- s64 fair_clock;
u64 exec_clock;
- s64 wait_runtime;
- u64 sleeper_bonus;
- unsigned long wait_runtime_overruns, wait_runtime_underruns;
+ u64 min_vruntime;
struct rb_root tasks_