-rw-r--r--  Documentation/scheduler/sched-bwc.txt | 122
-rw-r--r--  drivers/acpi/apei/Kconfig             |   1
-rw-r--r--  include/linux/irq_work.h              |  15
-rw-r--r--  include/linux/llist.h                 |  77
-rw-r--r--  include/linux/sched.h                 |   7
-rw-r--r--  include/trace/events/sched.h          |   9
-rw-r--r--  init/Kconfig                          |  12
-rw-r--r--  kernel/irq_work.c                     |  91
-rw-r--r--  kernel/sched.c                        | 666
-rw-r--r--  kernel/sched_cpupri.c                 |  89
-rw-r--r--  kernel/sched_cpupri.h                 |   7
-rw-r--r--  kernel/sched_fair.c                   | 761
-rw-r--r--  kernel/sched_features.h               |   5
-rw-r--r--  kernel/sched_rt.c                     |  99
-rw-r--r--  kernel/sched_stoptask.c               |   2
-rw-r--r--  kernel/sysctl.c                       |  10
-rw-r--r--  lib/Kconfig                           |   3
-rw-r--r--  lib/Makefile                          |   4
-rw-r--r--  lib/llist.c                           |  74
-rw-r--r--  lib/smp_processor_id.c                |   2

20 files changed, 1646 insertions(+), 410 deletions(-)
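
The headline change in this series is CFS bandwidth control: within each
"period" a group may consume at most "quota" microseconds of CPU time, as
documented in the new sched-bwc.txt below. As a quick orientation before the
diff itself, here is a minimal user-space sketch (not part of the patch) that
applies the document's 20%-of-one-CPU example through cgroupfs; the mount
point /sys/fs/cgroup/cpu/grp is an assumed, hypothetical path.

	#include <stdio.h>

	/* Write a single integer to a cgroup control file. */
	static int write_val(const char *path, long val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fprintf(f, "%ld\n", val);
		return fclose(f);
	}

	int main(void)
	{
		/* quota = 10ms per period = 50ms  =>  20% of one CPU.
		 * The cgroup path below is hypothetical. */
		write_val("/sys/fs/cgroup/cpu/grp/cpu.cfs_quota_us", 10000);
		write_val("/sys/fs/cgroup/cpu/grp/cpu.cfs_period_us", 50000);
		return 0;
	}
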
diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt
new file mode 100644
index 00000000000..f6b1873f68a
--- /dev/null
+++ b/Documentation/scheduler/sched-bwc.txt
@@ -0,0 +1,122 @@
+CFS Bandwidth Control
+=====================
+
+[ This document only discusses CPU bandwidth control for SCHED_NORMAL.
+  The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ]
+
+CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the
+specification of the maximum CPU bandwidth available to a group or hierarchy.
+
+The bandwidth allowed for a group is specified using a quota and period. Within
+each given "period" (microseconds), a group is allowed to consume only up to
+"quota" microseconds of CPU time. When the CPU bandwidth consumption of a
+group exceeds this limit (for that period), the tasks belonging to its
+hierarchy will be throttled and are not allowed to run again until the next
+period.
+
+A group's unused runtime is globally tracked, being refreshed to the quota
+described above at each period boundary. As threads consume this bandwidth it
+is transferred to cpu-local "silos" on a demand basis. The amount transferred
+within each of these updates is tunable and described as the "slice".
+
+Management
+----------
+Quota and period are managed within the cpu subsystem via cgroupfs.
+
+cpu.cfs_quota_us: the total available run-time within a period (in microseconds)
+cpu.cfs_period_us: the length of a period (in microseconds)
+cpu.stat: exports throttling statistics [explained further below]
+
+The default values are:
+	cpu.cfs_period_us=100ms
+	cpu.cfs_quota_us=-1
+
+A value of -1 for cpu.cfs_quota_us indicates that the group does not have any
+bandwidth restriction in place; such a group is described as an unconstrained
+bandwidth group. This represents the traditional work-conserving behavior for
+CFS.
+
+Writing any (valid) positive value will enact the specified bandwidth limit.
+The minimum value allowed for either the quota or the period is 1ms. There is
+also an upper bound on the period length of 1s. Additional restrictions exist
+when bandwidth limits are used in a hierarchical fashion; these are explained
+in more detail below.
+
+Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit
+and return the group to an unconstrained state once more.
+
+Any updates to a group's bandwidth specification will result in it becoming
+unthrottled if it is in a constrained state.
+
+System wide settings
+--------------------
+For efficiency, run-time is transferred between the global pool and CPU-local
+"silos" in a batch fashion. This greatly reduces global accounting pressure
+on large systems. The amount transferred each time such an update is required
+is described as the "slice".
+
+This is tunable via procfs:
+	/proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms)
+
+Larger slice values will reduce transfer overheads, while smaller values allow
+for more fine-grained consumption.
+
+Statistics
+----------
+A group's bandwidth statistics are exported via 3 fields in cpu.stat.
+
+cpu.stat:
+- nr_periods: Number of enforcement intervals that have elapsed.
+- nr_throttled: Number of times the group has been throttled/limited.
+- throttled_time: The total time duration (in nanoseconds) for which entities
+  of the group have been throttled.
+
+This interface is read-only.
+
+Hierarchical considerations
+---------------------------
+The interface enforces that an individual entity's bandwidth is always
+attainable, that is: max(c_i) <= C. However, over-subscription in the
+aggregate case is explicitly allowed to enable work-conserving semantics
+within a hierarchy.
+	e.g. \Sum (c_i) may exceed C
+[ Where C is the parent's bandwidth, and c_i its children ]
+
+There are two ways in which a group may become throttled:
+	a. it fully consumes its own quota within a period
+	b. a parent's quota is fully consumed within its period
+
+In case b) above, even though the child may have runtime remaining it will
+not be allowed to run until the parent's runtime is refreshed.
+
+Examples
+--------
+1. Limit a group to 1 CPU worth of runtime.
+
+	If period is 250ms and quota is also 250ms, the group will get
+	1 CPU worth of runtime every 250ms.
+
+	# echo 250000 > cpu.cfs_quota_us	/* quota = 250ms */
+	# echo 250000 > cpu.cfs_period_us	/* period = 250ms */
+
+2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine.
+
+	With a 500ms period and 1000ms quota, the group can get 2 CPUs worth
+	of runtime every 500ms.
+
+	# echo 1000000 > cpu.cfs_quota_us	/* quota = 1000ms */
+	# echo 500000 > cpu.cfs_period_us	/* period = 500ms */
+
+	The larger period here allows for increased burst capacity.
+
+3. Limit a group to 20% of 1 CPU.
+
+	With a 50ms period, a 10ms quota is equivalent to 20% of 1 CPU.
+
+	# echo 10000 > cpu.cfs_quota_us		/* quota = 10ms */
+	# echo 50000 > cpu.cfs_period_us	/* period = 50ms */
+
+	By using a small period here we ensure a consistent latency response
+	at the expense of burst capacity.
+
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index e3f47872ec2..f0c1ce95a0e 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -14,7 +14,6 @@ config ACPI_APEI_GHES
 	depends on ACPI_APEI && X86
 	select ACPI_HED
 	select IRQ_WORK
-	select LLIST
 	select GENERIC_ALLOCATOR
 	help
 	  Generic Hardware Error Source provides a way to report
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 4fa09d4d0b7..6a9e8f5399e 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -1,20 +1,23 @@
 #ifndef _LINUX_IRQ_WORK_H
 #define _LINUX_IRQ_WORK_H
 
+#include <linux/llist.h>
+
 struct irq_work {
-	struct irq_work *next;
+	unsigned long flags;
+	struct llist_node llnode;
 	void (*func)(struct irq_work *);
 };
 
 static inline
-void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *))
+void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
 {
-	entry->next = NULL;
-	entry->func = func;
+	work->flags = 0;
+	work->func = func;
 }
 
-bool irq_work_queue(struct irq_work *entry);
+bool irq_work_queue(struct irq_work *work);
 void irq_work_run(void);
-void irq_work_sync(struct irq_work *entry);
+void irq_work_sync(struct irq_work *work);
 
 #endif /* _LINUX_IRQ_WORK_H */
diff --git a/include/linux/llist.h b/include/linux/llist.h
index aa0c8b5b3cd..7287734e08d 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -35,10 +35,30 @@
  *
  * The basic atomic operation of this list is cmpxchg on long. On
  * architectures that don't have NMI-safe cmpxchg implementation, the
- * list can NOT be used in NMI handler. So code uses the list in NMI
- * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ * list can NOT be used in NMI handlers. So code that uses the list in
+ * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ *
+ * Copyright 2010,2011 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/kernel.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+
 struct llist_head {
 	struct llist_node *first;
 };
@@ -113,14 +133,55 @@ static inline void init_llist_head(struct llist_head *list)
  * test whether the list is empty without deleting something from the
  * list.
  */
-static inline int llist_empty(const struct llist_head *head)
+static inline bool llist_empty(const struct llist_head *head)
 {
 	return ACCESS_ONCE(head->first) == NULL;
 }
 
-void llist_add(struct llist_node *new, struct llist_head *head);
-void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
-		     struct llist_head *head);
-struct llist_node *llist_del_first(struct llist_head *head);
-struct llist_node *llist_del_all(struct llist_head *head);
+static inline struct llist_node *llist_next(struct llist_node *node)
+{
+	return node->next;
+}
+
+/**
+ * llist_add - add a new entry
+ * @new:	new entry to be added
+ * @head:	the head for your lock-less list
+ *
+ * Return whether list is empty before adding.
+ */
+static inline bool llist_add(struct llist_node *new, struct llist_head *head)
+{
+	struct llist_node *entry, *old_entry;
+
+	entry = head->first;
+	for (;;) {
+		old_entry = entry;
+		new->next = entry;
+		entry = cmpxchg(&head->first, old_entry, new);
+		if (entry == old_entry)
+			break;
+	}
+
+	return old_entry == NULL;
+}
+
+/**
+ * llist_del_all - delete all entries from lock-less list
+ * @head:	the head of lock-less list to delete all entries
+ *
+ * If list is empty, return NULL, otherwise, delete all entries and
+ * return the pointer to the first entry. The order of entries
+ * deleted is from the newest to the oldest added one.
+ */
+static inline struct llist_node *llist_del_all(struct llist_head *head)
+{
+	return xchg(&head->first, NULL);
+}
+
+extern bool llist_add_batch(struct llist_node *new_first,
+			    struct llist_node *new_last,
+			    struct llist_head *head);
+extern struct llist_node *llist_del_first(struct llist_head *head);
+
 #endif /* LLIST_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ede8a6585e3..e8acce717d2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -90,6 +90,7 @@ struct sched_param {
 #include <linux/task_io_accounting.h>
 #include <linux/latencytop.h>
 #include <linux/cred.h>
+#include <linux/llist.h>
 
 #include <asm/processor.h>
 
@@ -1224,7 +1225,7 @@ struct task_struct {
 	unsigned int ptrace;
 
 #ifdef CONFIG_SMP
-	struct task_struct *wake_entry;
+	struct llist_node wake_entry;
 	int on_cpu;
 #endif
 	int on_rq;
@@ -2035,6 +2036,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
 static inline void sched_autogroup_exit(struct signal_struct *sig) { }
 #endif
 
+#ifdef CONFIG_CFS_BANDWIDTH
+extern unsigned int sysctl_sched_cfs_bandwidth_slice;
+#endif
+
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index f6334782a59..959ff18b63b 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
 	 * For all intents and purposes a preempted task is a running task.
 	 */
 	if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)
-		state = TASK_RUNNING;
+		state = TASK_RUNNING | TASK_STATE_MAX;
 #endif
 
 	return state;
@@ -137,13 +137,14 @@ TRACE_EVENT(sched_switch,
 		__entry->next_prio	= next->prio;
 	),
 
-	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> next_comm=%s next_pid=%d next_prio=%d",
+	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
 		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-		__entry->prev_state ?
-		  __print_flags(__entry->prev_state, "|",
+		__entry->prev_state & (TASK_STATE_MAX-1) ?
+		  __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
 				{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
 				{ 16, "Z" }, { 32, "X" }, { 64, "x" },
 				{ 128, "W" }) : "R",
+		__entry->prev_state & TASK_STATE_MAX ? "+" : "",
 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
diff --git a/init/Kconfig b/init/Kconfig
index dc7e27bf89a..31ba0fd0f36 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED
 	depends on CGROUP_SCHED
 	default CGROUP_SCHED
 
+config CFS_BANDWIDTH
+	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED
+	default n
+	help
+	  This option allows users to define CPU bandwidth rates (limits) for
+	  tasks running within the fair group scheduler. Groups with no limit
+	  set are considered to be unconstrained and will run with no
+	  restriction.
+	  See tip/Documentation/scheduler/sched-bwc.txt for more information.
+
 config RT_GROUP_SCHED
 	bool "Group scheduling for SCHED_RR/FIFO"
 	depends on EXPERIMENTAL
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c58fa7da8ae..0e2cde4f380 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -17,54 +17,34 @@
  * claimed   NULL, 3 -> {pending}       : claimed to be enqueued
  * pending   next, 3 -> {busy}          : queued, pending callback
  * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
- *
- * We use the lower two bits of the next pointer to keep PENDING and BUSY
- * flags.
  */
 
 #define IRQ_WORK_PENDING	1UL
 #define IRQ_WORK_BUSY		2UL
 #define IRQ_WORK_FLAGS		3UL
 
-static inline bool irq_work_is_set(struct irq_work *entry, int flags)
-{
-	return (unsigned long)entry->next & flags;
-}
-
-static inline struct irq_work *irq_work_next(struct irq_work *entry)
-{
-	unsigned long next = (unsigned long)entry->next;
-	next &= ~IRQ_WORK_FLAGS;
-	return (struct irq_work *)next;
-}
-
-static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
-{
-	unsigned long next = (unsigned long)entry;
-	next |= flags;
-	return (struct irq_work *)next;
-}
-
-static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
+static DEFINE_PER_CPU(struct llist_head, irq_work_list);
 
 /*
  * Claim the entry so that no one else will poke at it.
  */
-static bool irq_work_claim(struct irq_work *entry)
+static bool irq_work_claim(struct irq_work *work)
 {
-	struct irq_work *next, *nflags;
+	unsigned long flags, nflags;
 
-	do {
-		next = entry->next;
-		if ((unsigned long)next & IRQ_WORK_PENDING)
+	for (;;) {
+		flags = work->flags;
+		if (flags & IRQ_WORK_PENDING)
 			return false;
-		nflags = next_flags(next, IRQ_WORK_FLAGS);
-	} while (cmpxchg(&entry->next, next, nflags) != next);
+		nflags = flags | IRQ_WORK_FLAGS;
+		if (cmpxchg(&work->flags, flags, nflags) == flags)
+			break;
+		cpu_relax();
+	}
 
 	return true;
 }
 
-
 void __weak arch_irq_work_raise(void)
 {
 	/*
@@ -75,20 +55,15 @@ void __weak arch_irq_work_raise(void)
 /*
  * Queue the entry and raise the IPI if needed.
  */
-static void __irq_work_queue(struct irq_work *entry)
+static void __irq_work_queue(struct irq_work *work)
 {
-	struct irq_work *next;
+	bool empty;
 
 	preempt_disable();
 
-	do {
-		next = __this_cpu_read(irq_work_list);
-		/* Can assign non-atomic because we keep the flags set. */
-		entry->next = next_flags(next, IRQ_WORK_FLAGS);
-	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
-
+	empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
 	/* The list was empty, raise self-interrupt to start processing. */
-	if (!irq_work_next(entry))
+	if (empty)
 		arch_irq_work_raise();
 
 	preempt_enable();
@@ -100,16 +75,16 @@ static void __irq_work_queue(struct irq_work *entry)
  *
  * Can be re-enqueued while the callback is still in progress.
  */
-bool irq_work_queue(struct irq_work *entry)
+bool irq_work_queue(struct irq_work *work)
 {
-	if (!irq_work_claim(entry)) {
+	if (!irq_work_claim(work)) {
 		/*
 		 * Already enqueued, can't do!
 		 */
 		return false;
 	}
 
-	__irq_work_queue(entry);
+	__irq_work_queue(work);
 	return true;
 }
 EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -120,34 +95,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
  */
 void irq_work_run(void)
 {
-	struct irq_work *list;
+	struct irq_work *work;
+	struct llist_head *this_list;
+	struct llist_node *llnode;
 
-	if (this_cpu_read(irq_work_list) == NULL)
+	this_list = &__get_cpu_var(irq_work_list);
+	if (llist_empty(this_list))
 		return;
 
 	BUG_ON(!in_irq());
 	BUG_ON(!irqs_disabled());
 
-	list = this_cpu_xchg(irq_work_list, NULL);
-
-	while (list != NULL) {
-		struct irq_work *entry = list;
+	llnode = llist_del_all(this_list);
+	while (llnode != NULL) {
+		work = llist_entry(llnode, struct irq_work, llnode);
 
-		list = irq_work_next(list);
+		llnode = llist_next(llnode);
 
 		/*
-		 * Clear the PENDING bit, after this point the @entry
+		 * Clear the PENDING bit, after this point the @work
 		 * can be re-used.
 		 */
-		entry->next = next_flags(NULL, IRQ_WORK_BUSY);
-		entry->func(entry);
+		work->flags = IRQ_WORK_BUSY;
+		work->func(work);
 
 		/*
 		 * Clear the BUSY bit and return to the free state if
 		 * no-one else claimed it meanwhile.
 		 */
-		(void)cmpxchg(&entry->next,
-			      next_flags(NULL, IRQ_WORK_BUSY),
-			      NULL);
+		(void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
 	}
 }
 EXPORT_SYMBOL_GPL(irq_work_run);
@@ -156,11 +131,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
  * Synchronize against the irq_work @entry, ensures the entry is not
  * currently in use.
  */
-void irq_work_sync(struct irq_work *entry)
+void irq_work_sync(struct irq_work *work)
 {
 	WARN_ON_ONCE(irqs_disabled());
 
-	while (irq_work_is_set(entry, IRQ_WORK_BUSY))
+	while (work->flags & IRQ_WORK_BUSY)
 		cpu_relax();
 }
 EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/sched.c b/kernel/sched.c
index 03ad0113801..d87c6e5d4e8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void)
 	return sysctl_sched_rt_runtime >= 0;
 }
 
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
-	ktime_t now;
+	unsigned long delta;
+	ktime_t soft, hard, now;
+
+	for (;;) {
+		if (hrtimer_active(period_timer))
+			break;
+
+		now = hrtimer_cb_get_time(period_timer);
+		hrtimer_forward(period_timer, now, period);
+		soft = hrtimer_get_softexpires(period_timer);
+		hard = hrtimer_get_expires(period_timer);
+		delta = ktime_to_ns(ktime_sub(hard, soft));
+		__hrtimer_start_range_ns(period_timer, soft, delta,
+					 HRTIMER_MODE_ABS_PINNED, 0);
+	}
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
 	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 		return;
 
 	raw_spin_lock(&rt_b->rt_runtime_lock);
-	for (;;) {
-		unsigned long delta;
-		ktime_t soft, hard;
-
-		if (hrtimer_active(&rt_b->rt_period_timer))
-			break;
-
-		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
-		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-
-		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
-		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
-		delta = ktime_to_ns(ktime_sub(hard, soft));
-		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
-					 HRTIMER_MODE_ABS_PINNED, 0);
-	}
+	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
@@ -247,6 +250,24 @@ struct cfs_rq;
 
 static LIST_HEAD(task_groups);
 
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+	raw_spinlock_t lock;
+	ktime_t period;
+	u64 quota, runtime;
+	s64 hierarchal_quota;
+	u64 runtime_expires;
+
+	int idle, timer_active;
+	struct hrtimer period_timer, slack_timer;
+	struct list_head throttled_cfs_rq;
+
+	/* statistics */
+	int nr_periods, nr_throttled;
+	u64 throttled_time;
+#endif
+};
+
 /* task group related information */
 struct task_group {
 	struct cgroup_subsys_state css;
@@ -278,6 +299,8 @@ struct task_group {
 #ifdef CONFIG_SCHED_AUTOGROUP
 	struct autogroup *autogroup;
 #endif
+
+	struct cfs_bandwidth cfs_bandwidth;
 };
 
 /* task_group_lock serializes the addition/removal of task groups */
@@ -311,7 +334,7 @@ struct task_group root_task_group;
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight load;
-	unsigned long nr_running;
+	unsigned long nr_running, h_nr_running;
 
 	u64 exec_clock;
 	u64 min_vruntime;
@@ -377,9 +400,120 @@ struct cfs_rq {
 
 	unsigned long load_contribution;
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	int runtime_enabled;
+	u64 runtime_expires;
+	s64 runtime_remaining;
+
+	u64 throttled_timestamp;
+	int throttled, throttle_count;
+	struct list_head throttled_list;
+#endif
 #endif
 };
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return &tg->cfs_bandwidth;
+}
+
+static inline u64 default_cfs_period(void);
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, slack_timer);
+	do_sched_cfs_slack_timer(cfs_b);
+
+	return HRTIMER_NORESTART;
+}
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, period_timer);
+	ktime_t now;
+	int overrun;
+	int idle = 0;
+
+	for (;;) {
+		now = hrtimer_cb_get_time(timer);
+		overrun = hrtimer_forward(timer, now, cfs_b->period);
+
+		if (!overrun)
+			break;
+
+		idle = do_sched_cfs_period_timer(cfs_b, overrun);
+	}
+
+	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	raw_spin_lock_init(&cfs_b->lock);
+	cfs_b->runtime = 0;
+	cfs_b->quota = RUNTIME_INF;
+	cfs_b->period = ns_to_ktime(default_cfs_period());
+
+	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
+	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_b->period_timer.function = sched_cfs_period_timer;
+	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_b->slack_timer.function = sched_cfs_slack_timer;
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	cfs_rq->runtime_enabled = 0;
+	INIT_LIST_HEAD(&cfs_rq->throttled_list);
+}
+
+/* requires cfs_b->lock, may release to reprogram timer */
+static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	/*
+	 * The timer may be active because we're trying to set a new bandwidth
+	 * period or because we're racing with the tear-down path
+	 * (timer_active==0 becomes visible before the hrtimer call-back
+	 * terminates).
+	 * In either case we ensure that it's re-programmed.
+	 */
+	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+		raw_spin_unlock(&cfs_b->lock);
+		/* ensure cfs_b->lock is available while we wait */
+		hrtimer_cancel(&cfs_b->period_timer);
+
+		raw_spin_lock(&cfs_b->lock);
+		/* if someone else restarted the timer then we're done */
+		if (cfs_b->timer_active)
+			return;
+	}
+
+	cfs_b->timer_active = 1;
+	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	hrtimer_cancel(&cfs_b->period_timer);
+	hrtimer_cancel(&cfs_b->slack_timer);
+}
+#else
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return NULL;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 /* Real-Time classes' related field in a runqueue: */
 struct rt_rq {
 	struct rt_prio_array active;
@@ -510,7 +644,7 @@ struct rq {
 
 	unsigned long cpu_power;
 
-	unsigned char idle_at_tick;
+	unsigned char idle_balance;
 	/* For active balancing */
 	int post_schedule;
 	int active_balance;
@@ -520,8 +654,6 @@ struct rq {
 	int cpu;
 	int online;
 
-	unsigned long avg_load_per_task;
-
 	u64 rt_avg;
 	u64 age_stamp;
 	u64 idle_stamp;
@@ -570,7 +702,7 @@ struct rq {
 #endif
 
 #ifdef CONFIG_SMP
-	struct task_struct *wake_list;
+	struct llist_head wake_list;
 #endif
 };
 
@@ -1272,6 +1404,18 @@ void wake_up_idle_cpu(int cpu)
 		smp_send_reschedule(cpu);
 }
 
+static inline bool got_nohz_idle_kick(void)
+{
+	return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
+}
+
+#else /* CONFIG_NO_HZ */
+
+static inline bool got_nohz_idle_kick(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_NO_HZ */
 
 static u64 sched_avg_period(void)
@@ -1471,24 +1615,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
+			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
 typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
+ * Iterate task_group tree rooted at *from, calling @down when first entering a
+ * node and @up when leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
  */
-static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+static int walk_tg_tree_from(struct task_group *from,
+			     tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
 	int ret;
 
-	rcu_read_lock();
-	parent = &root_task_group;
+	parent = from;
+
 down:
 	ret = (*down)(parent, data);
 	if (ret)
-		goto out_unlock;
+		goto out;
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
 
@@ -1497,19 +1645,29 @@ up:
 		continue;
 	}
 	ret = (*up)(parent, data);
-	if (ret)
-		goto out_unlock;
+	if (ret || parent == from)
+		goto out;
 
 	child = parent;
 	parent = parent->parent;
 	if (parent)
 		goto up;
-out_unlock:
-	rcu_read_unlock();
-
+out:
 	return ret;
 }
 
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+
+static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+{
+	return walk_tg_tree_from(&root_task_group, down, up, data);
+}
+
 static int tg_nop(struct task_group *tg, void *data)
 {
 	return 0;
@@ -1569,11 +1727,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
 
 	if (nr_running)
-		rq->avg_load_per_task = rq->load.weight / nr_running;
-	else
-		rq->avg_load_per_task = 0;
+		return rq->load.weight / nr_running;
 
-	return rq->avg_load_per_task;
+	return 0;
 }
 
 #ifdef CONFIG_PREEMPT
@@ -1806,7 +1962,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, flags);
-	inc_nr_running(rq);
 }
 
 /*
@@ -1818,7 +1973,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, flags);
-	dec_nr_running(rq);
 }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2390,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 
 	/* Look for allowed, online CPU in same node. */
 	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
-		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 			return dest_cpu;
 
 	/* Any allowed, online CPU? */
-	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
+	dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
 	if (dest_cpu < nr_cpu_ids)
 		return dest_cpu;
 
@@ -2431,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
 	 * [ this allows ->select_task() to simply return task_cpu(p) and
 	 *   not worry about this generic constraint ]
 	 */
-	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
+	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
 		     !cpu_online(cpu)))
 		cpu = select_fallback_rq(task_cpu(p), p);
 
@@ -2556,42 +2710,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 }
 
 #ifdef CONFIG_SMP
-static void sched_ttwu_do_pending(struct task_struct *list)
+static void sched_ttwu_pending(void)
 {
 	struct rq *rq = this_rq();
+	struct llist_node *llist = llist_del_all(&rq->wake_list);
+	struct task_struct *p;
 
 	raw_spin_lock(&rq->lock);
 
-	while (list) {
-		struct task_struct *p = list;
-		list = list->wake_entry;
+	while (llist) {
+		p = llist_entry(llist, struct task_struct, wake_entry);
+		llist = llist_next(llist);
 		ttwu_do_activate(rq, p, 0);
 	}
 
 	raw_spin_unlock(&rq->lock);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void sched_ttwu_pending(void)
-{
-	struct rq *rq = this_rq();
-	struct task_struct *list = xchg(&rq->wake_list, NULL);
-
-	if (!list)
-		return;
-
-	sched_ttwu_do_pending(list);
-}
-
-#endif /* CONFIG_HOTPLUG_CPU */
-
 void scheduler_ipi(void)
 {
-	struct rq *rq = this_rq();
-	struct task_struct *list = xchg(&rq->wake_list, NULL);
-
-	if (!list)
+	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
 		return;
 
 	/*
@@ -2608,25 +2746,21 @@ void scheduler_ipi(void)
 	 * somewhat pessimize the simple resched case.
 	 */
 	irq_enter();
-	sched_ttwu_do_pending(list);
+	sched_ttwu_pending();
+
+	/*
+	 * Check if someone kicked us for doing the nohz idle load balance.
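
[ The diff is truncated above, mid-way through the scheduler_ipi() hunk in
kernel/sched.c; the remaining hunks are not shown. ]

To recap the lock-less pattern that ties the series together: irq_work marks
an entry claimed with a cmpxchg on its flags word, publishes it with the
cmpxchg-based llist prepend, and a consumer drains the whole per-cpu list with
a single xchg; the remote-wakeup list reuses the same llist primitives. The
sketch below is a stand-alone user-space approximation, not the kernel code:
GCC atomic builtins stand in for the kernel's cmpxchg()/xchg(), the claim step
is simplified to a single attempt, and every name is local to the demo.

	#include <stdio.h>
	#include <stddef.h>

	#define WORK_PENDING	1UL
	#define WORK_BUSY	2UL
	#define WORK_FLAGS	3UL

	struct llnode { struct llnode *next; };
	struct llhead { struct llnode *first; };

	struct work {
		unsigned long flags;
		struct llnode llnode;
		int payload;
	};

	/* container_of()-style back-pointer, as llist_entry() in the patch */
	#define node_to_work(ptr) \
		((struct work *)((char *)(ptr) - offsetof(struct work, llnode)))

	/* Lock-less prepend, mirroring the new inline llist_add(). */
	static int ll_add(struct llnode *new, struct llhead *head)
	{
		struct llnode *entry, *old_entry;

		entry = head->first;
		for (;;) {
			old_entry = entry;
			new->next = entry;
			entry = __sync_val_compare_and_swap(&head->first,
							    old_entry, new);
			if (entry == old_entry)
				break;
		}
		return old_entry == NULL; /* list was empty before the add */
	}

	/* Simplified single-attempt claim; the kernel retries with
	 * cpu_relax() until the cmpxchg succeeds or PENDING is seen. */
	static int claim(struct work *w)
	{
		unsigned long flags = w->flags;

		if (flags & WORK_PENDING)
			return 0; /* already queued */
		return __sync_val_compare_and_swap(&w->flags, flags,
						   flags | WORK_FLAGS) == flags;
	}

	int main(void)
	{
		struct llhead list = { NULL };
		struct work a = { 0, { NULL }, 1 };
		struct work b = { 0, { NULL }, 2 };
		struct llnode *n;

		if (claim(&a))
			ll_add(&a.llnode, &list);
		if (claim(&b))
			ll_add(&b.llnode, &list);

		/* Consumer: grab the whole list at once (llist_del_all),
		 * newest entry first, then run and release each item. */
		for (n = __atomic_exchange_n(&list.first, NULL,
					     __ATOMIC_SEQ_CST);
		     n; n = n->next) {
			struct work *w = node_to_work(n);

			w->flags = WORK_BUSY;	/* clear PENDING */
			printf("payload=%d\n", w->payload);
			(void)__sync_val_compare_and_swap(&w->flags,
							  WORK_BUSY, 0UL);
		}
		return 0;
	}

As in llist_del_all(), the drained entries come back newest-first, so the
demo prints payload=2 before payload=1.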