aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/scheduler/sched-bwc.txt122
-rw-r--r--drivers/acpi/apei/Kconfig1
-rw-r--r--include/linux/irq_work.h15
-rw-r--r--include/linux/llist.h77
-rw-r--r--include/linux/sched.h7
-rw-r--r--include/trace/events/sched.h9
-rw-r--r--init/Kconfig12
-rw-r--r--kernel/irq_work.c91
-rw-r--r--kernel/sched.c666
-rw-r--r--kernel/sched_cpupri.c89
-rw-r--r--kernel/sched_cpupri.h7
-rw-r--r--kernel/sched_fair.c761
-rw-r--r--kernel/sched_features.h5
-rw-r--r--kernel/sched_rt.c99
-rw-r--r--kernel/sched_stoptask.c2
-rw-r--r--kernel/sysctl.c10
-rw-r--r--lib/Kconfig3
-rw-r--r--lib/Makefile4
-rw-r--r--lib/llist.c74
-rw-r--r--lib/smp_processor_id.c2
20 files changed, 1646 insertions, 410 deletions
diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt
new file mode 100644
index 00000000000..f6b1873f68a
--- /dev/null
+++ b/Documentation/scheduler/sched-bwc.txt
@@ -0,0 +1,122 @@
+CFS Bandwidth Control
+=====================
+
+[ This document only discusses CPU bandwidth control for SCHED_NORMAL.
+ The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ]
+
+CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the
+specification of the maximum CPU bandwidth available to a group or hierarchy.
+
+The bandwidth allowed for a group is specified using a quota and period. Within
+each given "period" (microseconds), a group is allowed to consume only up to
+"quota" microseconds of CPU time. When the CPU bandwidth consumption of a
+group exceeds this limit (for that period), the tasks belonging to its
+hierarchy will be throttled and are not allowed to run again until the next
+period.
+
+A group's unused runtime is globally tracked, being refreshed with quota units
+above at each period boundary. As threads consume this bandwidth it is
+transferred to cpu-local "silos" on a demand basis. The amount transferred
+within each of these updates is tunable and described as the "slice".
+
+Management
+----------
+Quota and period are managed within the cpu subsystem via cgroupfs.
+
+cpu.cfs_quota_us: the total available run-time within a period (in microseconds)
+cpu.cfs_period_us: the length of a period (in microseconds)
+cpu.stat: exports throttling statistics [explained further below]
+
+The default values are:
+ cpu.cfs_period_us=100ms
+ cpu.cfs_quota=-1
+
+A value of -1 for cpu.cfs_quota_us indicates that the group does not have any
+bandwidth restriction in place, such a group is described as an unconstrained
+bandwidth group. This represents the traditional work-conserving behavior for
+CFS.
+
+Writing any (valid) positive value(s) will enact the specified bandwidth limit.
+The minimum quota allowed for the quota or period is 1ms. There is also an
+upper bound on the period length of 1s. Additional restrictions exist when
+bandwidth limits are used in a hierarchical fashion, these are explained in
+more detail below.
+
+Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit
+and return the group to an unconstrained state once more.
+
+Any updates to a group's bandwidth specification will result in it becoming
+unthrottled if it is in a constrained state.
+
+System wide settings
+--------------------
+For efficiency run-time is transferred between the global pool and CPU local
+"silos" in a batch fashion. This greatly reduces global accounting pressure
+on large systems. The amount transferred each time such an update is required
+is described as the "slice".
+
+This is tunable via procfs:
+ /proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms)
+
+Larger slice values will reduce transfer overheads, while smaller values allow
+for more fine-grained consumption.
+
+Statistics
+----------
+A group's bandwidth statistics are exported via 3 fields in cpu.stat.
+
+cpu.stat:
+- nr_periods: Number of enforcement intervals that have elapsed.
+- nr_throttled: Number of times the group has been throttled/limited.
+- throttled_time: The total time duration (in nanoseconds) for which entities
+ of the group have been throttled.
+
+This interface is read-only.
+
+Hierarchical considerations
+---------------------------
+The interface enforces that an individual entity's bandwidth is always
+attainable, that is: max(c_i) <= C. However, over-subscription in the
+aggregate case is explicitly allowed to enable work-conserving semantics
+within a hierarchy.
+ e.g. \Sum (c_i) may exceed C
+[ Where C is the parent's bandwidth, and c_i its children ]
+
+
+There are two ways in which a group may become throttled:
+ a. it fully consumes its own quota within a period
+ b. a parent's quota is fully consumed within its period
+
+In case b) above, even though the child may have runtime remaining it will not
+be allowed to until the parent's runtime is refreshed.
+
+Examples
+--------
+1. Limit a group to 1 CPU worth of runtime.
+
+ If period is 250ms and quota is also 250ms, the group will get
+ 1 CPU worth of runtime every 250ms.
+
+ # echo 250000 > cpu.cfs_quota_us /* quota = 250ms */
+ # echo 250000 > cpu.cfs_period_us /* period = 250ms */
+
+2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine.
+
+ With 500ms period and 1000ms quota, the group can get 2 CPUs worth of
+ runtime every 500ms.
+
+ # echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */
+ # echo 500000 > cpu.cfs_period_us /* period = 500ms */
+
+ The larger period here allows for increased burst capacity.
+
+3. Limit a group to 20% of 1 CPU.
+
+ With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU.
+
+ # echo 10000 > cpu.cfs_quota_us /* quota = 10ms */
+ # echo 50000 > cpu.cfs_period_us /* period = 50ms */
+
+ By using a small period here we are ensuring a consistent latency
+ response at the expense of burst capacity.
+
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index e3f47872ec2..f0c1ce95a0e 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -14,7 +14,6 @@ config ACPI_APEI_GHES
depends on ACPI_APEI && X86
select ACPI_HED
select IRQ_WORK
- select LLIST
select GENERIC_ALLOCATOR
help
Generic Hardware Error Source provides a way to report
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 4fa09d4d0b7..6a9e8f5399e 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -1,20 +1,23 @@
#ifndef _LINUX_IRQ_WORK_H
#define _LINUX_IRQ_WORK_H
+#include <linux/llist.h>
+
struct irq_work {
- struct irq_work *next;
+ unsigned long flags;
+ struct llist_node llnode;
void (*func)(struct irq_work *);
};
static inline
-void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *))
+void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
{
- entry->next = NULL;
- entry->func = func;
+ work->flags = 0;
+ work->func = func;
}
-bool irq_work_queue(struct irq_work *entry);
+bool irq_work_queue(struct irq_work *work);
void irq_work_run(void);
-void irq_work_sync(struct irq_work *entry);
+void irq_work_sync(struct irq_work *work);
#endif /* _LINUX_IRQ_WORK_H */
diff --git a/include/linux/llist.h b/include/linux/llist.h
index aa0c8b5b3cd..7287734e08d 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -35,10 +35,30 @@
*
* The basic atomic operation of this list is cmpxchg on long. On
* architectures that don't have NMI-safe cmpxchg implementation, the
- * list can NOT be used in NMI handler. So code uses the list in NMI
- * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ * list can NOT be used in NMI handlers. So code that uses the list in
+ * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ *
+ * Copyright 2010,2011 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/kernel.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+
struct llist_head {
struct llist_node *first;
};
@@ -113,14 +133,55 @@ static inline void init_llist_head(struct llist_head *list)
* test whether the list is empty without deleting something from the
* list.
*/
-static inline int llist_empty(const struct llist_head *head)
+static inline bool llist_empty(const struct llist_head *head)
{
return ACCESS_ONCE(head->first) == NULL;
}
-void llist_add(struct llist_node *new, struct llist_head *head);
-void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
- struct llist_head *head);
-struct llist_node *llist_del_first(struct llist_head *head);
-struct llist_node *llist_del_all(struct llist_head *head);
+static inline struct llist_node *llist_next(struct llist_node *node)
+{
+ return node->next;
+}
+
+/**
+ * llist_add - add a new entry
+ * @new: new entry to be added
+ * @head: the head for your lock-less list
+ *
+ * Return whether list is empty before adding.
+ */
+static inline bool llist_add(struct llist_node *new, struct llist_head *head)
+{
+ struct llist_node *entry, *old_entry;
+
+ entry = head->first;
+ for (;;) {
+ old_entry = entry;
+ new->next = entry;
+ entry = cmpxchg(&head->first, old_entry, new);
+ if (entry == old_entry)
+ break;
+ }
+
+ return old_entry == NULL;
+}
+
+/**
+ * llist_del_all - delete all entries from lock-less list
+ * @head: the head of lock-less list to delete all entries
+ *
+ * If list is empty, return NULL, otherwise, delete all entries and
+ * return the pointer to the first entry. The order of entries
+ * deleted is from the newest to the oldest added one.
+ */
+static inline struct llist_node *llist_del_all(struct llist_head *head)
+{
+ return xchg(&head->first, NULL);
+}
+
+extern bool llist_add_batch(struct llist_node *new_first,
+ struct llist_node *new_last,
+ struct llist_head *head);
+extern struct llist_node *llist_del_first(struct llist_head *head);
+
#endif /* LLIST_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ede8a6585e3..e8acce717d2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -90,6 +90,7 @@ struct sched_param {
#include <linux/task_io_accounting.h>
#include <linux/latencytop.h>
#include <linux/cred.h>
+#include <linux/llist.h>
#include <asm/processor.h>
@@ -1224,7 +1225,7 @@ struct task_struct {
unsigned int ptrace;
#ifdef CONFIG_SMP
- struct task_struct *wake_entry;
+ struct llist_node wake_entry;
int on_cpu;
#endif
int on_rq;
@@ -2035,6 +2036,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
static inline void sched_autogroup_exit(struct signal_struct *sig) { }
#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+extern unsigned int sysctl_sched_cfs_bandwidth_slice;
+#endif
+
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index f6334782a59..959ff18b63b 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
* For all intents and purposes a preempted task is a running task.
*/
if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)
- state = TASK_RUNNING;
+ state = TASK_RUNNING | TASK_STATE_MAX;
#endif
return state;
@@ -137,13 +137,14 @@ TRACE_EVENT(sched_switch,
__entry->next_prio = next->prio;
),
- TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> next_comm=%s next_pid=%d next_prio=%d",
+ TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
- __entry->prev_state ?
- __print_flags(__entry->prev_state, "|",
+ __entry->prev_state & (TASK_STATE_MAX-1) ?
+ __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
{ 16, "Z" }, { 32, "X" }, { 64, "x" },
{ 128, "W" }) : "R",
+ __entry->prev_state & TASK_STATE_MAX ? "+" : "",
__entry->next_comm, __entry->next_pid, __entry->next_prio)
);
diff --git a/init/Kconfig b/init/Kconfig
index dc7e27bf89a..31ba0fd0f36 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED
depends on CGROUP_SCHED
default CGROUP_SCHED
+config CFS_BANDWIDTH
+ bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
+ depends on EXPERIMENTAL
+ depends on FAIR_GROUP_SCHED
+ default n
+ help
+ This option allows users to define CPU bandwidth rates (limits) for
+ tasks running within the fair group scheduler. Groups with no limit
+ set are considered to be unconstrained and will run with no
+ restriction.
+ See tip/Documentation/scheduler/sched-bwc.txt for more information.
+
config RT_GROUP_SCHED
bool "Group scheduling for SCHED_RR/FIFO"
depends on EXPERIMENTAL
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c58fa7da8ae..0e2cde4f380 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -17,54 +17,34 @@
* claimed NULL, 3 -> {pending} : claimed to be enqueued
* pending next, 3 -> {busy} : queued, pending callback
* busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
- *
- * We use the lower two bits of the next pointer to keep PENDING and BUSY
- * flags.
*/
#define IRQ_WORK_PENDING 1UL
#define IRQ_WORK_BUSY 2UL
#define IRQ_WORK_FLAGS 3UL
-static inline bool irq_work_is_set(struct irq_work *entry, int flags)
-{
- return (unsigned long)entry->next & flags;
-}
-
-static inline struct irq_work *irq_work_next(struct irq_work *entry)
-{
- unsigned long next = (unsigned long)entry->next;
- next &= ~IRQ_WORK_FLAGS;
- return (struct irq_work *)next;
-}
-
-static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
-{
- unsigned long next = (unsigned long)entry;
- next |= flags;
- return (struct irq_work *)next;
-}
-
-static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
+static DEFINE_PER_CPU(struct llist_head, irq_work_list);
/*
* Claim the entry so that no one else will poke at it.
*/
-static bool irq_work_claim(struct irq_work *entry)
+static bool irq_work_claim(struct irq_work *work)
{
- struct irq_work *next, *nflags;
+ unsigned long flags, nflags;
- do {
- next = entry->next;
- if ((unsigned long)next & IRQ_WORK_PENDING)
+ for (;;) {
+ flags = work->flags;
+ if (flags & IRQ_WORK_PENDING)
return false;
- nflags = next_flags(next, IRQ_WORK_FLAGS);
- } while (cmpxchg(&entry->next, next, nflags) != next);
+ nflags = flags | IRQ_WORK_FLAGS;
+ if (cmpxchg(&work->flags, flags, nflags) == flags)
+ break;
+ cpu_relax();
+ }
return true;
}
-
void __weak arch_irq_work_raise(void)
{
/*
@@ -75,20 +55,15 @@ void __weak arch_irq_work_raise(void)
/*
* Queue the entry and raise the IPI if needed.
*/
-static void __irq_work_queue(struct irq_work *entry)
+static void __irq_work_queue(struct irq_work *work)
{
- struct irq_work *next;
+ bool empty;
preempt_disable();
- do {
- next = __this_cpu_read(irq_work_list);
- /* Can assign non-atomic because we keep the flags set. */
- entry->next = next_flags(next, IRQ_WORK_FLAGS);
- } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
-
+ empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
/* The list was empty, raise self-interrupt to start processing. */
- if (!irq_work_next(entry))
+ if (empty)
arch_irq_work_raise();
preempt_enable();
@@ -100,16 +75,16 @@ static void __irq_work_queue(struct irq_work *entry)
*
* Can be re-enqueued while the callback is still in progress.
*/
-bool irq_work_queue(struct irq_work *entry)
+bool irq_work_queue(struct irq_work *work)
{
- if (!irq_work_claim(entry)) {
+ if (!irq_work_claim(work)) {
/*
* Already enqueued, can't do!
*/
return false;
}
- __irq_work_queue(entry);
+ __irq_work_queue(work);
return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -120,34 +95,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
*/
void irq_work_run(void)
{
- struct irq_work *list;
+ struct irq_work *work;
+ struct llist_head *this_list;
+ struct llist_node *llnode;
- if (this_cpu_read(irq_work_list) == NULL)
+ this_list = &__get_cpu_var(irq_work_list);
+ if (llist_empty(this_list))
return;
BUG_ON(!in_irq());
BUG_ON(!irqs_disabled());
- list = this_cpu_xchg(irq_work_list, NULL);
-
- while (list != NULL) {
- struct irq_work *entry = list;
+ llnode = llist_del_all(this_list);
+ while (llnode != NULL) {
+ work = llist_entry(llnode, struct irq_work, llnode);
- list = irq_work_next(list);
+ llnode = llist_next(llnode);
/*
- * Clear the PENDING bit, after this point the @entry
+ * Clear the PENDING bit, after this point the @work
* can be re-used.
*/
- entry->next = next_flags(NULL, IRQ_WORK_BUSY);
- entry->func(entry);
+ work->flags = IRQ_WORK_BUSY;
+ work->func(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
- (void)cmpxchg(&entry->next,
- next_flags(NULL, IRQ_WORK_BUSY),
- NULL);
+ (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
}
}
EXPORT_SYMBOL_GPL(irq_work_run);
@@ -156,11 +131,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
* Synchronize against the irq_work @entry, ensures the entry is not
* currently in use.
*/
-void irq_work_sync(struct irq_work *entry)
+void irq_work_sync(struct irq_work *work)
{
WARN_ON_ONCE(irqs_disabled());
- while (irq_work_is_set(entry, IRQ_WORK_BUSY))
+ while (work->flags & IRQ_WORK_BUSY)
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/sched.c b/kernel/sched.c
index 03ad0113801..d87c6e5d4e8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void)
return sysctl_sched_rt_runtime >= 0;
}
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
- ktime_t now;
+ unsigned long delta;
+ ktime_t soft, hard, now;
+
+ for (;;) {
+ if (hrtimer_active(period_timer))
+ break;
+
+ now = hrtimer_cb_get_time(period_timer);
+ hrtimer_forward(period_timer, now, period);
+ soft = hrtimer_get_softexpires(period_timer);
+ hard = hrtimer_get_expires(period_timer);
+ delta = ktime_to_ns(ktime_sub(hard, soft));
+ __hrtimer_start_range_ns(period_timer, soft, delta,
+ HRTIMER_MODE_ABS_PINNED, 0);
+ }
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
return;
raw_spin_lock(&rt_b->rt_runtime_lock);
- for (;;) {
- unsigned long delta;
- ktime_t soft, hard;
-
- if (hrtimer_active(&rt_b->rt_period_timer))
- break;
-
- now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
- hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-
- soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
- hard = hrtimer_get_expires(&rt_b->rt_period_timer);
- delta = ktime_to_ns(ktime_sub(hard, soft));
- __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
- HRTIMER_MODE_ABS_PINNED, 0);
- }
+ start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -247,6 +250,24 @@ struct cfs_rq;
static LIST_HEAD(task_groups);
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+ raw_spinlock_t lock;
+ ktime_t period;
+ u64 quota, runtime;
+ s64 hierarchal_quota;
+ u64 runtime_expires;
+
+ int idle, timer_active;
+ struct hrtimer period_timer, slack_timer;
+ struct list_head throttled_cfs_rq;
+
+ /* statistics */
+ int nr_periods, nr_throttled;
+ u64 throttled_time;
+#endif
+};
+
/* task group related information */
struct task_group {
struct cgroup_subsys_state css;
@@ -278,6 +299,8 @@ struct task_group {
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup *autogroup;
#endif
+
+ struct cfs_bandwidth cfs_bandwidth;
};
/* task_group_lock serializes the addition/removal of task groups */
@@ -311,7 +334,7 @@ struct task_group root_task_group;
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned long nr_running;
+ unsigned long nr_running, h_nr_running;
u64 exec_clock;
u64 min_vruntime;
@@ -377,9 +400,120 @@ struct cfs_rq {
unsigned long load_contribution;
#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ int runtime_enabled;
+ u64 runtime_expires;
+ s64 runtime_remaining;
+
+ u64 throttled_timestamp;
+ int throttled, throttle_count;
+ struct list_head throttled_list;
+#endif
#endif
};
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+ return &tg->cfs_bandwidth;
+}
+
+static inline u64 default_cfs_period(void);
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, slack_timer);
+ do_sched_cfs_slack_timer(cfs_b);
+
+ return HRTIMER_NORESTART;
+}
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, period_timer);
+ ktime_t now;
+ int overrun;
+ int idle = 0;
+
+ for (;;) {
+ now = hrtimer_cb_get_time(timer);
+ overrun = hrtimer_forward(timer, now, cfs_b->period);
+
+ if (!overrun)
+ break;
+
+ idle = do_sched_cfs_period_timer(cfs_b, overrun);
+ }
+
+ return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ raw_spin_lock_init(&cfs_b->lock);
+ cfs_b->runtime = 0;
+ cfs_b->quota = RUNTIME_INF;
+ cfs_b->period = ns_to_ktime(default_cfs_period());
+
+ INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->period_timer.function = sched_cfs_period_timer;
+ hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->slack_timer.function = sched_cfs_slack_timer;
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->runtime_enabled = 0;
+ INIT_LIST_HEAD(&cfs_rq->throttled_list);
+}
+
+/* requires cfs_b->lock, may release to reprogram timer */
+static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ /*
+ * The timer may be active because we're trying to set a new bandwidth
+ * period or because we're racing with the tear-down path
+ * (timer_active==0 becomes visible before the hrtimer call-back
+ * terminates). In either case we ensure that it's re-programmed
+ */
+ while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+ raw_spin_unlock(&cfs_b->lock);
+ /* ensure cfs_b->lock is available while we wait */
+ hrtimer_cancel(&cfs_b->period_timer);
+
+ raw_spin_lock(&cfs_b->lock);
+ /* if someone else restarted the timer then we're done */
+ if (cfs_b->timer_active)
+ return;
+ }
+
+ cfs_b->timer_active = 1;
+ start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ hrtimer_cancel(&cfs_b->period_timer);
+ hrtimer_cancel(&cfs_b->slack_timer);
+}
+#else
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+ return NULL;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
@@ -510,7 +644,7 @@ struct rq {
unsigned long cpu_power;
- unsigned char idle_at_tick;
+ unsigned char idle_balance;
/* For active balancing */
int post_schedule;
int active_balance;
@@ -520,8 +654,6 @@ struct rq {
int cpu;
int online;
- unsigned long avg_load_per_task;
-
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
@@ -570,7 +702,7 @@ struct rq {
#endif
#ifdef CONFIG_SMP
- struct task_struct *wake_list;
+ struct llist_head wake_list;
#endif
};
@@ -1272,6 +1404,18 @@ void wake_up_idle_cpu(int cpu)
smp_send_reschedule(cpu);
}
+static inline bool got_nohz_idle_kick(void)
+{
+ return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
+}
+
+#else /* CONFIG_NO_HZ */
+
+static inline bool got_nohz_idle_kick(void)
+{
+ return false;
+}
+
#endif /* CONFIG_NO_HZ */
static u64 sched_avg_period(void)
@@ -1471,24 +1615,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
update_load_sub(&rq->load, load);
}
-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
+ (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
typedef int (*tg_visitor)(struct task_group *, void *);
/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
+ * Iterate task_group tree rooted at *from, calling @down when first entering a
+ * node and @up when leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
*/
-static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+static int walk_tg_tree_from(struct task_group *from,
+ tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
int ret;
- rcu_read_lock();
- parent = &root_task_group;
+ parent = from;
+
down:
ret = (*down)(parent, data);
if (ret)
- goto out_unlock;
+ goto out;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
@@ -1497,19 +1645,29 @@ up:
continue;
}
ret = (*up)(parent, data);
- if (ret)
- goto out_unlock;
+ if (ret || parent == from)
+ goto out;
child = parent;
parent = parent->parent;
if (parent)
goto up;
-out_unlock:
- rcu_read_unlock();
-
+out:
return ret;
}
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+
+static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+{
+ return walk_tg_tree_from(&root_task_group, down, up, data);
+}
+
static int tg_nop(struct task_group *tg, void *data)
{
return 0;
@@ -1569,11 +1727,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
if (nr_running)
- rq->avg_load_per_task = rq->load.weight / nr_running;
- else
- rq->avg_load_per_task = 0;
+ return rq->load.weight / nr_running;
- return rq->avg_load_per_task;
+ return 0;
}
#ifdef CONFIG_PREEMPT
@@ -1806,7 +1962,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
rq->nr_uninterruptible--;
enqueue_task(rq, p, flags);
- inc_nr_running(rq);
}
/*
@@ -1818,7 +1973,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
rq->nr_uninterruptible++;
dequeue_task(rq, p, flags);
- dec_nr_running(rq);
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2390,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* Look for allowed, online CPU in same node. */
for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+ if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
return dest_cpu;
/* Any allowed, online CPU? */
- dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
+ dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
if (dest_cpu < nr_cpu_ids)
return dest_cpu;
@@ -2431,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
* [ this allows ->select_task() to simply return task_cpu(p) and
* not worry about this generic constraint ]
*/
- if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
+ if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
!cpu_online(cpu)))
cpu = select_fallback_rq(task_cpu(p), p);
@@ -2556,42 +2710,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-static void sched_ttwu_do_pending(struct task_struct *list)
+static void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
+ struct llist_node *llist = llist_del_all(&rq->wake_list);
+ struct task_struct *p;
raw_spin_lock(&rq->lock);
- while (list) {
- struct task_struct *p = list;
- list = list->wake_entry;
+ while (llist) {
+ p = llist_entry(llist, struct task_struct, wake_entry);
+ llist = llist_next(llist);
ttwu_do_activate(rq, p, 0);
}
raw_spin_unlock(&rq->lock);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void sched_ttwu_pending(void)
-{
- struct rq *rq = this_rq();
- struct task_struct *list = xchg(&rq->wake_list, NULL);
-
- if (!list)
- return;
-
- sched_ttwu_do_pending(list);
-}
-
-#endif /* CONFIG_HOTPLUG_CPU */
-
void scheduler_ipi(void)
{
- struct rq *rq = this_rq();
- struct task_struct *list = xchg(&rq->wake_list, NULL);
-
- if (!list)
+ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
return;
/*
@@ -2608,25 +2746,21 @@ void scheduler_ipi(void)
* somewhat pessimize the simple resched case.
*/
irq_enter();
- sched_ttwu_do_pending(list);
+ sched_ttwu_pending();
+
+ /*
+ * Check if someone kicked us for doing the nohz idle load balance.