From dbda92d16f8655044e082930e4e9d244b87fde77 Mon Sep 17 00:00:00 2001 From: "Bu, Yitian" Date: Mon, 18 Feb 2013 12:53:37 +0000 Subject: printk: Fix rq->lock vs logbuf_lock unlock lock inversion commit 07354eb1a74d1 ("locking printk: Annotate logbuf_lock as raw") reintroduced a lock inversion problem which was fixed in commit 0b5e1c5255 ("printk: Release console_sem after logbuf_lock"). This happened probably when fixing up patch rejects. Restore the ordering and unlock logbuf_lock before releasing console_sem. Signed-off-by: ybu Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/E807E903FE6CBE4D95E420FBFCC273B827413C@nasanexd01h.na.qualcomm.com Signed-off-by: Thomas Gleixner --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 267ce780abe..e698e80d842 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1358,9 +1358,9 @@ static int console_trylock_for_printk(unsigned int cpu) } } logbuf_cpu = UINT_MAX; + raw_spin_unlock(&logbuf_lock); if (wake) up(&console_sem); - raw_spin_unlock(&logbuf_lock); return retval; } -- cgit v1.2.3-70-g09d2 From 45ceebf77653975815d82fcf7cec0a164215ae11 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 19 Apr 2013 15:10:49 -0400 Subject: sched: Factor out load calculation code from sched/core.c --> sched/proc.c This large chunk of load calculation code can be easily divorced from the main core.c scheduler file, with only a couple prototypes and externs added to a kernel/sched header. Some recent commits expanded the code and the documentation of it, making it large enough to warrant separation. For example, see: 556061b, "sched/nohz: Fix rq->cpu_load[] calculations" 5aaa0b7, "sched/nohz: Fix rq->cpu_load calculations some more" 5167e8d, "sched/nohz: Rewrite and fix load-avg computation -- again" More importantly, it helps reduce the size of the main sched/core.c by yet another significant amount (~600 lines). 
Signed-off-by: Paul Gortmaker Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1366398650-31599-2-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 2 +- kernel/sched/core.c | 569 ------------------------------------------------- kernel/sched/proc.c | 578 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 8 + 4 files changed, 587 insertions(+), 570 deletions(-) create mode 100644 kernel/sched/proc.c (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index deaf90e4a1d..54adcf35f49 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o +obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272f..bfa7e77e0b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2056,575 +2056,6 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } -unsigned long this_cpu_load(void) -{ - struct rq *this = this_rq(); - return this->cpu_load[0]; -} - - -/* - * Global load-average calculations - * - * We take a distributed and async approach to calculating the global load-avg - * in order to minimize overhead. - * - * The global load average is an exponentially decaying average of nr_running + - * nr_uninterruptible. - * - * Once every LOAD_FREQ: - * - * nr_active = 0; - * for_each_possible_cpu(cpu) - * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; - * - * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) - * - * Due to a number of reasons the above turns in the mess below: - * - * - for_each_possible_cpu() is prohibitively expensive on machines with - * serious number of cpus, therefore we need to take a distributed approach - * to calculating nr_active. - * - * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 - * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } - * - * So assuming nr_active := 0 when we start out -- true per definition, we - * can simply take per-cpu deltas and fold those into a global accumulate - * to obtain the same result. See calc_load_fold_active(). - * - * Furthermore, in order to avoid synchronizing all per-cpu delta folding - * across the machine, we assume 10 ticks is sufficient time for every - * cpu to have completed this task. - * - * This places an upper-bound on the IRQ-off latency of the machine. Then - * again, being late doesn't loose the delta, just wrecks the sample. - * - * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because - * this would add another cross-cpu cacheline miss and atomic operation - * to the wakeup path. Instead we increment on whatever cpu the task ran - * when it went into uninterruptible state and decrement on whatever cpu - * did the wakeup. This means that only the sum of nr_uninterruptible over - * all cpus yields the correct result. - * - * This covers the NO_HZ=n code, for extra head-aches, see the comment below. 
- */ - -/* Variables and functions for calc_load */ -static atomic_long_t calc_load_tasks; -static unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); /* should be removed */ - -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} - -static long calc_load_fold_active(struct rq *this_rq) -{ - long nr_active, delta = 0; - - nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; - - if (nr_active != this_rq->calc_load_active) { - delta = nr_active - this_rq->calc_load_active; - this_rq->calc_load_active = nr_active; - } - - return delta; -} - -/* - * a1 = a0 * e + a * (1 - e) - */ -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * Handle NO_HZ for the global load-average. - * - * Since the above described distributed algorithm to compute the global - * load-average relies on per-cpu sampling from the tick, it is affected by - * NO_HZ. - * - * The basic idea is to fold the nr_active delta into a global idle-delta upon - * entering NO_HZ state such that we can include this as an 'extra' cpu delta - * when we read the global state. - * - * Obviously reality has to ruin such a delightfully simple scheme: - * - * - When we go NO_HZ idle during the window, we can negate our sample - * contribution, causing under-accounting. - * - * We avoid this by keeping two idle-delta counters and flipping them - * when the window starts, thus separating old and new NO_HZ load. - * - * The only trick is the slight shift in index flip for read vs write. - * - * 0s 5s 10s 15s - * +10 +10 +10 +10 - * |-|-----------|-|-----------|-|-----------|-| - * r:0 0 1 1 0 0 1 1 0 - * w:0 1 1 0 0 1 1 0 0 - * - * This ensures we'll fold the old idle contribution in this window while - * accumlating the new one. - * - * - When we wake up from NO_HZ idle during the window, we push up our - * contribution, since we effectively move our sample point to a known - * busy state. - * - * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the idle-delta for this cpu which - * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NOHZ idle for multiple - * LOAD_FREQ intervals. - * - * When making the ILB scale, we should try to pull this in as well. - */ -static atomic_long_t calc_load_idle[2]; -static int calc_load_idx; - -static inline int calc_load_write_idx(void) -{ - int idx = calc_load_idx; - - /* - * See calc_global_nohz(), if we observe the new index, we also - * need to observe the new update time. - */ - smp_rmb(); - - /* - * If the folding window started, make sure we start writing in the - * next idle-delta. 
- */ - if (!time_before(jiffies, calc_load_update)) - idx++; - - return idx & 1; -} - -static inline int calc_load_read_idx(void) -{ - return calc_load_idx & 1; -} - -void calc_load_enter_idle(void) -{ - struct rq *this_rq = this_rq(); - long delta; - - /* - * We're going into NOHZ mode, if there's any pending delta, fold it - * into the pending idle delta. - */ - delta = calc_load_fold_active(this_rq); - if (delta) { - int idx = calc_load_write_idx(); - atomic_long_add(delta, &calc_load_idle[idx]); - } -} - -void calc_load_exit_idle(void) -{ - struct rq *this_rq = this_rq(); - - /* - * If we're still before the sample window, we're done. - */ - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - /* - * We woke inside or after the sample window, this means we're already - * accounted through the nohz accounting, so skip the entire deal and - * sync up for the next window. - */ - this_rq->calc_load_update = calc_load_update; - if (time_before(jiffies, this_rq->calc_load_update + 10)) - this_rq->calc_load_update += LOAD_FREQ; -} - -static long calc_load_fold_idle(void) -{ - int idx = calc_load_read_idx(); - long delta = 0; - - if (atomic_long_read(&calc_load_idle[idx])) - delta = atomic_long_xchg(&calc_load_idle[idx], 0); - - return delta; -} - -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - -/* - * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. - * - * Once we've updated the global active value, we need to apply the exponential - * weights adjusted to the number of cycles missed. 
- */ -static void calc_global_nohz(void) -{ - long delta, active, n; - - if (!time_before(jiffies, calc_load_update + 10)) { - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - - calc_load_update += n * LOAD_FREQ; - } - - /* - * Flip the idle index... - * - * Make sure we first write the new time then flip the index, so that - * calc_load_write_idx() will see the new time when it reads the new - * index, this avoids a double flip messing things up. - */ - smp_wmb(); - calc_load_idx++; -} -#else /* !CONFIG_NO_HZ_COMMON */ - -static inline long calc_load_fold_idle(void) { return 0; } -static inline void calc_global_nohz(void) { } - -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * calc_load - update the avenrun load estimates 10 ticks after the - * CPUs have updated calc_load_tasks. - */ -void calc_global_load(unsigned long ticks) -{ - long active, delta; - - if (time_before(jiffies, calc_load_update + 10)) - return; - - /* - * Fold the 'old' idle-delta to include all NO_HZ cpus. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load(avenrun[0], EXP_1, active); - avenrun[1] = calc_load(avenrun[1], EXP_5, active); - avenrun[2] = calc_load(avenrun[2], EXP_15, active); - - calc_load_update += LOAD_FREQ; - - /* - * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. - */ - calc_global_nohz(); -} - -/* - * Called from update_cpu_load() to periodically update this CPU's - * active count. - */ -static void calc_load_account_active(struct rq *this_rq) -{ - long delta; - - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - this_rq->calc_load_update += LOAD_FREQ; -} - -/* - * End of global load-average stuff - */ - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. 
- */ -#define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. - */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) -{ - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - old_load = decay_load_missed(old_load, pending_updates - 1, i); - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } - - sched_avg_update(this_rq); -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -void update_idle_cpu_load(struct rq *this_rq) -{ - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = this_rq->load.weight; - unsigned long pending_updates; - - /* - * bail if there's load or we're actually up-to-date. - */ - if (load || curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); -} - -/* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. 
- */ -void update_cpu_load_nohz(void) -{ - struct rq *this_rq = this_rq(); - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long pending_updates; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. - */ - __update_cpu_load(this_rq, 0, pending_updates); - } - raw_spin_unlock(&this_rq->lock); -} -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * Called from scheduler_tick() - */ -static void update_cpu_load_active(struct rq *this_rq) -{ - /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). - */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, this_rq->load.weight, 1); - - calc_load_account_active(this_rq); -} - #ifdef CONFIG_SMP /* diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c new file mode 100644 index 00000000000..bb3a6a0b862 --- /dev/null +++ b/kernel/sched/proc.c @@ -0,0 +1,578 @@ +/* + * kernel/sched/proc.c + * + * Kernel load calculations, forked from sched/core.c + */ + +#include + +#include "sched.h" + +unsigned long this_cpu_load(void) +{ + struct rq *this = this_rq(); + return this->cpu_load[0]; +} + + +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. 
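/*
 * Illustration only (not part of the patch): the fixed-point step that
 * calc_load() below performs once per LOAD_FREQ window,
 *
 *	a1 = a0 * e + active * (FIXED_1 - e)
 *
 * written as a standalone userspace program.  The constants mirror
 * FSHIFT, FIXED_1 and EXP_1 from include/linux/sched.h; the task counts
 * are made up.
 */
#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1UL << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* ~ FIXED_1 * exp(-5s / 1min) */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* round to nearest */
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun0 = 0;
	unsigned long active = 3 * FIXED_1;	/* 3 runnable/uninterruptible tasks */

	/* ten 5-second samples with a constant active count of 3 */
	for (int i = 0; i < 10; i++) {
		avenrun0 = calc_load(avenrun0, EXP_1, active);
		printf("1-min load: %lu.%02lu\n", avenrun0 >> FSHIFT,
		       ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;
}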
+ */ + +/* Variables and functions for calc_load */ +atomic_long_t calc_load_tasks; +unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + +long calc_load_fold_active(struct rq *this_rq) +{ + long nr_active, delta = 0; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + } + + return delta; +} + +/* + * a1 = a0 * e + a * (1 - e) + */ +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. + * + * When making the ILB scale, we should try to pull this in as well. + */ +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; + +static inline int calc_load_write_idx(void) +{ + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. 
+ */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); + long delta; + + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ + delta = calc_load_fold_active(this_rq); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } +} + +void calc_load_exit_idle(void) +{ + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + /* + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. + */ + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); + + return delta; +} + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. 
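/*
 * Illustration only: the catch-up path in calc_global_nohz() below uses
 * the closed form  a_n = a_0 * e^n + a * (1 - e^n)  instead of looping n
 * times.  fixed_power_int() raises e to the n-th power in O(log n)
 * multiplies via square-and-multiply over the bits of n.  This fragment
 * reuses calc_load() and the FSHIFT/FIXED_1/EXP_1 constants from the
 * sketch above; the numbers are made up.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {
		if (n & 1) {			/* bit set: fold x^(2^i) into result */
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;				/* square the base for the next bit */
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

void demo_catch_up(void)
{
	unsigned long a_loop = FIXED_1;		/* start at load 1.00 */
	unsigned long active = 2 * FIXED_1;	/* 2 active tasks */
	unsigned int n = 7;			/* pretend we missed 7 LOAD_FREQ windows */
	unsigned long a_closed;

	for (unsigned int i = 0; i < n; i++)
		a_loop = calc_load(a_loop, EXP_1, active);

	a_closed = calc_load(FIXED_1, fixed_power_int(EXP_1, FSHIFT, n), active);
	/* a_loop and a_closed agree up to fixed-point rounding */
}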
+ */ +static void calc_global_nohz(void) +{ + long delta, active, n; + + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. + */ + smp_wmb(); + calc_load_idx++; +} +#else /* !CONFIG_NO_HZ_COMMON */ + +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } + +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(unsigned long ticks) +{ + long active, delta; + + if (time_before(jiffies, calc_load_update + 10)) + return; + + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; + + /* + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + */ + calc_global_nohz(); +} + +/* + * Called from update_cpu_load() to periodically update this CPU's + * active count. + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long delta; + + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + delta = calc_load_fold_active(this_rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + this_rq->calc_load_update += LOAD_FREQ; +} + +/* + * End of global load-average stuff + */ + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. 
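/*
 * Worked example (illustration only) of the bit-walk that
 * decay_load_missed() below performs, for load index 2 (per-tick factor
 * 3/4) and 10 missed ticks.  10 is binary 1010, so only the "2 ticks"
 * and "8 ticks" columns of degrade_factor[2] get multiplied in.
 */
#include <stdio.h>

#define DEGRADE_SHIFT 7

/* degrade_factor[2] from the table below: (3/4)^(2^col) on a 128 scale */
static const unsigned char degrade_idx2[DEGRADE_SHIFT + 1] =
	{96, 72, 40, 12, 1, 0, 0};

int main(void)
{
	unsigned long load = 1000, missed = 10;
	int col = 0;

	while (missed) {
		if (missed & 1)
			load = (load * degrade_idx2[col]) >> DEGRADE_SHIFT;
		missed >>= 1;
		col++;
	}
	/* prints 52; the exact (3/4)^10 * 1000 is ~56 */
	printf("decayed load: %lu\n", load);
	return 0;
}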
+ */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. + */ +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) +{ + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } + + sched_avg_update(this_rq); +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long load = this_rq->load.weight; + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. + */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. 
+ */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* + * Called from scheduler_tick() + */ +void update_cpu_load_active(struct rq *this_rq) +{ + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, this_rq->load.weight, 1); + + calc_load_account_active(this_rq); +} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ce39224d615..a38ee0a0650 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -10,8 +10,16 @@ #include "cpupri.h" #include "cpuacct.h" +struct rq; + extern __read_mostly int scheduler_running; +extern unsigned long calc_load_update; +extern atomic_long_t calc_load_tasks; + +extern long calc_load_fold_active(struct rq *this_rq); +extern void update_cpu_load_active(struct rq *this_rq); + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], -- cgit v1.2.3-70-g09d2 From 8527632dc95472adb571701e852479531c0567a2 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 19 Apr 2013 15:10:50 -0400 Subject: sched: Move update_load_*() methods from sched.h to fair.c These inlines are only used by kernel/sched/fair.c so they do not need to be present in the main kernel/sched/sched.h file. 
Signed-off-by: Paul Gortmaker Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1366398650-31599-3-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 ++++++++++++++++++ kernel/sched/sched.h | 18 ------------------ 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c61a614465c..08a554dd3e9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + /* * Increase the granularity value when there are more CPUs, * because with more CPUs the 'effective latency' as visible diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a38ee0a0650..f1f6256c122 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -892,24 +892,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WF_FORK 0x02 /* child wakeup after fork */ #define WF_MIGRATED 0x4 /* internal use, task got migrated */ -static inline void update_load_add(struct load_weight *lw, unsigned long inc) -{ - lw->weight += inc; - lw->inv_weight = 0; -} - -static inline void update_load_sub(struct load_weight *lw, unsigned long dec) -{ - lw->weight -= dec; - lw->inv_weight = 0; -} - -static inline void update_load_set(struct load_weight *lw, unsigned long w) -{ - lw->weight = w; - lw->inv_weight = 0; -} - /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that -- cgit v1.2.3-70-g09d2 From 424c93fe4cbe719e7fd7169248d2b648c493b68d Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Thu, 9 May 2013 11:24:03 -0500 Subject: sched: Use this_rq() helper It is a few instructions more efficent to and slightly more readable to use this_rq()-> instead of cpu_rq(smp_processor_id())-> . 
Size comparison of kernel/sched/fair.o: text data bss dec hex filename 27972 122 26 28120 6dd8 fair.o.before 27956 122 26 28104 6dc8 fair.o.after Signed-off-by: Nathan Zimmer Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1368116643-87971-1-git-send-email-nzimmer@sgi.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++---- kernel/sched/rt.c | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c61a614465c..f2c9c0c3406 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5418,10 +5418,9 @@ static inline void nohz_balance_exit_idle(int cpu) static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; - int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + sd = rcu_dereference_check_sched_domain(this_rq()->sd); if (!sd || !sd->nohz_idle) goto unlock; @@ -5436,10 +5435,9 @@ unlock: void set_cpu_sd_state_idle(void) { struct sched_domain *sd; - int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + sd = rcu_dereference_check_sched_domain(this_rq()->sd); if (!sd || sd->nohz_idle) goto unlock; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 127a2c4cf4a..7aced2e3b08 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -472,7 +472,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) #ifdef CONFIG_SMP static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_rq(smp_processor_id())->rd->span; + return this_rq()->rd->span; } #else static inline const struct cpumask *sched_rt_period_mask(void) -- cgit v1.2.3-70-g09d2 From 1b1d2fb4444231f25ddabc598aa2b5a9c0833fba Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:08 +0000 Subject: lockdep: remove task argument from debug_check_no_locks_held The only existing caller to debug_check_no_locks_held calls it with 'current' as the task, and the freezer needs to call debug_check_no_locks_held but doesn't already have a current task pointer, so remove the argument. It is already assuming that the current task is relevant by dumping the current stack trace as part of the warning. This was originally part of 6aa9707099c (lockdep: check that no locks held at freeze time) which was reverted in dbf520a9d7d4. Original-author: Mandeep Singh Baines Acked-by: Pavel Machek Acked-by: Tejun Heo Signed-off-by: Colin Cross Signed-off-by: Rafael J. 
Wysocki --- include/linux/debug_locks.h | 4 ++-- kernel/exit.c | 2 +- kernel/lockdep.c | 17 ++++++++--------- 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 21ca773f77b..822c1354f3a 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -51,7 +51,7 @@ struct task_struct; extern void debug_show_all_locks(void); extern void debug_show_held_locks(struct task_struct *task); extern void debug_check_no_locks_freed(const void *from, unsigned long len); -extern void debug_check_no_locks_held(struct task_struct *task); +extern void debug_check_no_locks_held(void); #else static inline void debug_show_all_locks(void) { @@ -67,7 +67,7 @@ debug_check_no_locks_freed(const void *from, unsigned long len) } static inline void -debug_check_no_locks_held(struct task_struct *task) +debug_check_no_locks_held(void) { } #endif diff --git a/kernel/exit.c b/kernel/exit.c index af2eb3cbd49..e5975627500 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -835,7 +835,7 @@ void do_exit(long code) /* * Make sure we are holding no locks: */ - debug_check_no_locks_held(tsk); + debug_check_no_locks_held(); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 1f3186b37fd..e16c45b9ee7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4090,7 +4090,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(struct task_struct *curr) +static void print_held_locks_bug(void) { if (!debug_locks_off()) return; @@ -4099,22 +4099,21 @@ static void print_held_locks_bug(struct task_struct *curr) printk("\n"); printk("=====================================\n"); - printk("[ BUG: lock held at task exit time! ]\n"); + printk("[ BUG: %s/%d still has locks held! ]\n", + current->comm, task_pid_nr(current)); print_kernel_ident(); printk("-------------------------------------\n"); - printk("%s/%d is exiting with locks still held!\n", - curr->comm, task_pid_nr(curr)); - lockdep_print_held_locks(curr); - + lockdep_print_held_locks(current); printk("\nstack backtrace:\n"); dump_stack(); } -void debug_check_no_locks_held(struct task_struct *task) +void debug_check_no_locks_held(void) { - if (unlikely(task->lockdep_depth > 0)) - print_held_locks_bug(task); + if (unlikely(current->lockdep_depth > 0)) + print_held_locks_bug(); } +EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { -- cgit v1.2.3-70-g09d2 From 18ad0c6297df1d671ecea83b608cd9e432642a05 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:10 +0000 Subject: freezer: shorten freezer sleep time using exponential backoff All tasks can easily be frozen in under 10 ms, switch to using an initial 1 ms sleep followed by exponential backoff until 8 ms. Also convert the printed time to ms instead of centiseconds. Acked-by: Pavel Machek Acked-by: Tejun Heo Signed-off-by: Colin Cross Signed-off-by: Rafael J. 
Wysocki --- kernel/power/process.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 98088e0e71e..fc0df848644 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -30,9 +30,10 @@ static int try_to_freeze_tasks(bool user_only) unsigned int todo; bool wq_busy = false; struct timeval start, end; - u64 elapsed_csecs64; - unsigned int elapsed_csecs; + u64 elapsed_msecs64; + unsigned int elapsed_msecs; bool wakeup = false; + int sleep_usecs = USEC_PER_MSEC; do_gettimeofday(&start); @@ -68,22 +69,25 @@ static int try_to_freeze_tasks(bool user_only) /* * We need to retry, but first give the freezing tasks some - * time to enter the refrigerator. + * time to enter the refrigerator. Start with an initial + * 1 ms sleep followed by exponential backoff until 8 ms. */ - msleep(10); + usleep_range(sleep_usecs / 2, sleep_usecs); + if (sleep_usecs < 8 * USEC_PER_MSEC) + sleep_usecs *= 2; } do_gettimeofday(&end); - elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_csecs64, NSEC_PER_SEC / 100); - elapsed_csecs = elapsed_csecs64; + elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); + do_div(elapsed_msecs64, NSEC_PER_MSEC); + elapsed_msecs = elapsed_msecs64; if (todo) { printk("\n"); - printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " + printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", wakeup ? "aborted" : "failed", - elapsed_csecs / 100, elapsed_csecs % 100, + elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); if (!wakeup) { @@ -96,8 +100,8 @@ static int try_to_freeze_tasks(bool user_only) read_unlock(&tasklist_lock); } } else { - printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, - elapsed_csecs % 100); + printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, + elapsed_msecs % 1000); } return todo ? -EBUSY : 0; -- cgit v1.2.3-70-g09d2 From 613f5d13b569859171f0896fbc73ee0bfa811fda Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:11 +0000 Subject: freezer: skip waking up tasks with PF_FREEZER_SKIP set Android goes through suspend/resume very often (every few seconds when on a busy wifi network with the screen off), and a significant portion of the energy used to go in and out of suspend is spent in the freezer. If a task has called freezer_do_not_count(), don't bother waking it up. If it happens to wake up later it will call freezer_count() and immediately enter the refrigerator. Combined with patches to convert freezable helpers to use freezer_do_not_count() and convert common sites where idle userspace tasks are blocked to use the freezable helpers, this reduces the time and energy required to suspend and resume. Acked-by: Tejun Heo Acked-by: Pavel Machek Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index c38893b0efb..8b2afc1c9df 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -110,6 +110,18 @@ bool freeze_task(struct task_struct *p) { unsigned long flags; + /* + * This check can race with freezer_do_not_count, but worst case that + * will result in an extra wakeup being sent to the task. 
It does not + * race with freezer_count(), the barriers in freezer_count() and + * freezer_should_skip() ensure that either freezer_count() sees + * freezing == true in try_to_freeze() and freezes, or + * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task + * normally. + */ + if (freezer_should_skip(p)) + return false; + spin_lock_irqsave(&freezer_lock, flags); if (!freezing(p) || frozen(p)) { spin_unlock_irqrestore(&freezer_lock, flags); -- cgit v1.2.3-70-g09d2 From 56467c7697f5aef6974501fbe2c3e63674583549 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:18 +0000 Subject: futex: use freezable blocking call Avoid waking up every thread sleeping in a futex_wait call during suspend and resume by calling a freezable blocking call. Previous patches modified the freezer to avoid sending wakeups to threads that are blocked in freezable blocking calls. This call was selected to be converted to a freezable call because it doesn't hold any locks or release any resources when interrupted that might be needed by another freezing task or a kernel driver during suspend, and is a common site where idle userspace tasks are blocked. Acked-by: Tejun Heo Acked-by: Thomas Gleixner Acked-by: Darren Hart Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b26dcfc02c9..d710fae8abb 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -61,6 +61,7 @@ #include #include #include +#include #include @@ -1807,7 +1808,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) - schedule(); + freezable_schedule(); } __set_current_state(TASK_RUNNING); } -- cgit v1.2.3-70-g09d2 From b0f8c44f30e58c3aaaaaf864d5c3d3cc2e8a4c2d Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:19 +0000 Subject: nanosleep: use freezable blocking call Avoid waking up every thread sleeping in a nanosleep call during suspend and resume by calling a freezable blocking call. Previous patches modified the freezer to avoid sending wakeups to threads that are blocked in freezable blocking calls. This call was selected to be converted to a freezable call because it doesn't hold any locks or release any resources when interrupted that might be needed by another freezing task or a kernel driver during suspend, and is a common site where idle userspace tasks are blocked. Acked-by: Tejun Heo Acked-by: Thomas Gleixner Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/hrtimer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index fd4b13b131f..3ee4d06c6fc 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -47,6 +47,7 @@ #include #include #include +#include #include @@ -1545,7 +1546,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod t->task = NULL; if (likely(t->task)) - schedule(); + freezable_schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; -- cgit v1.2.3-70-g09d2 From a2d5f1f5d941593e61071dc78e9de228eda5475f Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:20 +0000 Subject: sigtimedwait: use freezable blocking call Avoid waking up every thread sleeping in a sigtimedwait call during suspend and resume by calling a freezable blocking call. 
Previous patches modified the freezer to avoid sending wakeups to threads that are blocked in freezable blocking calls. This call was selected to be converted to a freezable call because it doesn't hold any locks or release any resources when interrupted that might be needed by another freezing task or a kernel driver during suspend, and is a common site where idle userspace tasks are blocked. Acked-by: Tejun Heo Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 113411bfe8b..50e41075ac7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2848,7 +2848,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); - timeout = schedule_timeout_interruptible(timeout); + timeout = freezable_schedule_timeout_interruptible(timeout); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); -- cgit v1.2.3-70-g09d2 From cee22a15052faa817e3ec8985a28154d3fabc7aa Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 8 Apr 2013 16:45:40 +0530 Subject: workqueues: Introduce new flag WQ_POWER_EFFICIENT for power oriented workqueues Workqueues can be performance or power-oriented. Currently, most workqueues are bound to the CPU they were created on. This gives good performance (due to cache effects) at the cost of potentially waking up otherwise idle cores (Idle from scheduler's perspective. Which may or may not be physically idle) just to process some work. To save power, we can allow the work to be rescheduled on a core that is already awake. Workqueues created with the WQ_UNBOUND flag will allow some power savings. However, we don't change the default behaviour of the system. To enable power-saving behaviour, a new config option CONFIG_WQ_POWER_EFFICIENT needs to be turned on. This option can also be overridden by the workqueue.power_efficient boot parameter. tj: Updated config description and comments. Renamed CONFIG_WQ_POWER_EFFICIENT to CONFIG_WQ_POWER_EFFICIENT_DEFAULT. Signed-off-by: Viresh Kumar Reviewed-by: Amit Kucheria Signed-off-by: Tejun Heo --- Documentation/kernel-parameters.txt | 15 +++++++++++++++ include/linux/workqueue.h | 27 +++++++++++++++++++++++++++ kernel/power/Kconfig | 20 ++++++++++++++++++++ kernel/workqueue.c | 13 +++++++++++++ 4 files changed, 75 insertions(+) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index c3bfacb9291..37dfd720de7 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3320,6 +3320,21 @@ bytes respectively. Such letter suffixes can also be entirely omitted. that this also can be controlled per-workqueue for workqueues visible under /sys/bus/workqueue/. + workqueue.power_efficient + Per-cpu workqueues are generally preferred because + they show better performance thanks to cache + locality; unfortunately, per-cpu workqueues tend to + be more power hungry than unbound workqueues. + + Enabling this makes the per-cpu workqueues which + were observed to contribute significantly to power + consumption unbound, leading to measurably lower + power usage at the cost of small performance + overhead. + + The default value of this parameter is determined by + the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT. 
+ x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 623488fdc1f..fc0136b604f 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -303,6 +303,33 @@ enum { WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ WQ_SYSFS = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */ + /* + * Per-cpu workqueues are generally preferred because they tend to + * show better performance thanks to cache locality. Per-cpu + * workqueues exclude the scheduler from choosing the CPU to + * execute the worker threads, which has an unfortunate side effect + * of increasing power consumption. + * + * The scheduler considers a CPU idle if it doesn't have any task + * to execute and tries to keep idle cores idle to conserve power; + * however, for example, a per-cpu work item scheduled from an + * interrupt handler on an idle CPU will force the scheduler to + * excute the work item on that CPU breaking the idleness, which in + * turn may lead to more scheduling choices which are sub-optimal + * in terms of power consumption. + * + * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default + * but become unbound if workqueue.power_efficient kernel param is + * specified. Per-cpu workqueues which are identified to + * contribute significantly to power-consumption are identified and + * marked with this flag and enabling the power_efficient mode + * leads to noticeable power saving at the cost of small + * performance disadvantage. + * + * http://thread.gmane.org/gmane.linux.kernel/1480396 + */ + WQ_POWER_EFFICIENT = 1 << 7, + __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5dfdc9ea180..46455961a88 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -263,6 +263,26 @@ config PM_GENERIC_DOMAINS bool depends on PM +config WQ_POWER_EFFICIENT_DEFAULT + bool "Enable workqueue power-efficient mode by default" + depends on PM + default n + help + Per-cpu workqueues are generally preferred because they show + better performance thanks to cache locality; unfortunately, + per-cpu workqueues tend to be more power hungry than unbound + workqueues. + + Enabling workqueue.power_efficient kernel parameter makes the + per-cpu workqueues which were observed to contribute + significantly to power consumption unbound, leading to measurably + lower power usage at the cost of small performance overhead. + + This config option determines whether workqueue.power_efficient + is enabled by default. + + If in doubt, say N. 
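A hedged driver-side sketch (not part of the patch) of opting a workqueue into this mode with the new flag; the "foo" names and the work handler are hypothetical.

#include <linux/workqueue.h>

static struct workqueue_struct *foo_wq;

static void foo_work_fn(struct work_struct *work)
{
	/* housekeeping that does not need to run on the submitting CPU */
}
static DECLARE_WORK(foo_work, foo_work_fn);

static int foo_init(void)
{
	/* per-cpu by default; unbound when workqueue.power_efficient is set */
	foo_wq = alloc_workqueue("foo_wq", WQ_POWER_EFFICIENT, 0);
	if (!foo_wq)
		return -ENOMEM;

	queue_work(foo_wq, &foo_work);
	return 0;
}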
+ config PM_GENERIC_DOMAINS_SLEEP def_bool y depends on PM_SLEEP && PM_GENERIC_DOMAINS diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4aa9f5bc6b2..8068d97ce14 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask; static bool wq_disable_numa; module_param_named(disable_numa, wq_disable_numa, bool, 0444); +/* see the comment above the definition of WQ_POWER_EFFICIENT */ +#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT +static bool wq_power_efficient = true; +#else +static bool wq_power_efficient; +#endif + +module_param_named(power_efficient, wq_power_efficient, bool, 0444); + static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -4085,6 +4094,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct workqueue_struct *wq; struct pool_workqueue *pwq; + /* see the comment above the definition of WQ_POWER_EFFICIENT */ + if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) + flags |= WQ_UNBOUND; + /* allocate wq and format name */ if (flags & WQ_UNBOUND) tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); -- cgit v1.2.3-70-g09d2 From 0668106ca3865ba945e155097fb042bf66d364d3 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 24 Apr 2013 17:12:54 +0530 Subject: workqueue: Add system wide power_efficient workqueues This patch adds system wide workqueues aligned towards power saving. This is done by allocating them with WQ_UNBOUND flag if 'wq_power_efficient' is set to 'true'. tj: updated comments a bit. Signed-off-by: Viresh Kumar Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 8 ++++++++ kernel/workqueue.c | 13 ++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index fc0136b604f..a9f4119c7e2 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -360,11 +360,19 @@ enum { * * system_freezable_wq is equivalent to system_wq except that it's * freezable. + * + * *_power_efficient_wq are inclined towards saving power and converted + * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, + * they are same as their non-power-efficient counterparts - e.g. + * system_power_efficient_wq is identical to system_wq if + * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. 
*/ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; +extern struct workqueue_struct *system_power_efficient_wq; +extern struct workqueue_struct *system_freezable_power_efficient_wq; static inline struct workqueue_struct * __deprecated __system_nrt_wq(void) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8068d97ce14..16ca2d3dd29 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -314,6 +314,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly; EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); +struct workqueue_struct *system_power_efficient_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_power_efficient_wq); +struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void copy_workqueue_attrs(struct workqueue_attrs *to, @@ -4987,8 +4991,15 @@ static int __init init_workqueues(void) WQ_UNBOUND_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); + system_power_efficient_wq = alloc_workqueue("events_power_efficient", + WQ_POWER_EFFICIENT, 0); + system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient", + WQ_FREEZABLE | WQ_POWER_EFFICIENT, + 0); BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || - !system_unbound_wq || !system_freezable_wq); + !system_unbound_wq || !system_freezable_wq || + !system_power_efficient_wq || + !system_freezable_power_efficient_wq); return 0; } early_initcall(init_workqueues); -- cgit v1.2.3-70-g09d2 From fa3ca07e96185aa1496b405472399a2a2a336a17 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Apr 2013 11:36:56 -0700 Subject: cgroup: refactor hierarchy_id handling We're planning to converting hierarchy_ida to an idr and use it to look up hierarchy from its id. As we want the mapping to happen atomically with cgroupfs_root registration, this patch refactors hierarchy_id init / exit so that ida operations happen inside cgroup_[root_]mutex. * s/init_root_id()/cgroup_init_root_id()/ and make it return 0 or -errno like a normal function. * Move hierarchy_id initialization from cgroup_root_from_opts() into cgroup_mount() block where the root is confirmed to be used and being registered while holding both mutexes. * Split cgroup_drop_id() into cgroup_exit_root_id() and cgroup_free_root(), so that ID release can happen before dropping the mutexes in cgroup_kill_sb(). The latter expects hierarchy_id to be exited before being invoked. 
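A minimal sketch of the call ordering this refactor is aiming for (not part of the patch; cgroup_init_root_id(), cgroup_exit_root_id() and cgroup_free_root() are the real helpers, the example_* wrappers are illustrative): allocate the hierarchy ID only once the root is actually being registered while both mutexes are held, release the ID before the mutexes are dropped, and free the root afterwards.

    static int example_register_root(struct cgroupfs_root *root)
    {
            int ret;

            mutex_lock(&cgroup_mutex);
            mutex_lock(&cgroup_root_mutex);

            ret = cgroup_init_root_id(root);        /* now returns 0 or -errno */
            if (ret)
                    goto out_unlock;
            /* ... remainder of registration ... */
    out_unlock:
            mutex_unlock(&cgroup_root_mutex);
            mutex_unlock(&cgroup_mutex);
            return ret;
    }

    static void example_unregister_root(struct cgroupfs_root *root)
    {
            mutex_lock(&cgroup_mutex);
            mutex_lock(&cgroup_root_mutex);
            cgroup_exit_root_id(root);      /* release the ID under the mutexes ... */
            mutex_unlock(&cgroup_root_mutex);
            mutex_unlock(&cgroup_mutex);

            cgroup_free_root(root);         /* ... then free; it warns if the ID is still set */
    }
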
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 56 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a9926275f8..dbc84f7d23b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1426,13 +1426,13 @@ static void init_cgroup_root(struct cgroupfs_root *root) list_add_tail(&cgrp->allcg_node, &root->allcg_list); } -static bool init_root_id(struct cgroupfs_root *root) +static int cgroup_init_root_id(struct cgroupfs_root *root) { - int ret = 0; + int ret; do { if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) - return false; + return -ENOMEM; spin_lock(&hierarchy_id_lock); /* Try to allocate the next unused ID */ ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, @@ -1448,7 +1448,18 @@ static bool init_root_id(struct cgroupfs_root *root) } spin_unlock(&hierarchy_id_lock); } while (ret); - return true; + return 0; +} + +static void cgroup_exit_root_id(struct cgroupfs_root *root) +{ + if (root->hierarchy_id) { + spin_lock(&hierarchy_id_lock); + ida_remove(&hierarchy_ida, root->hierarchy_id); + spin_unlock(&hierarchy_id_lock); + + root->hierarchy_id = 0; + } } static int cgroup_test_super(struct super_block *sb, void *data) @@ -1482,10 +1493,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) if (!root) return ERR_PTR(-ENOMEM); - if (!init_root_id(root)) { - kfree(root); - return ERR_PTR(-ENOMEM); - } init_cgroup_root(root); root->subsys_mask = opts->subsys_mask; @@ -1500,17 +1507,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static void cgroup_drop_root(struct cgroupfs_root *root) +static void cgroup_free_root(struct cgroupfs_root *root) { - if (!root) - return; + if (root) { + /* hierarhcy ID shoulid already have been released */ + WARN_ON_ONCE(root->hierarchy_id); - BUG_ON(!root->hierarchy_id); - spin_lock(&hierarchy_id_lock); - ida_remove(&hierarchy_ida, root->hierarchy_id); - spin_unlock(&hierarchy_id_lock); - ida_destroy(&root->cgroup_ida); - kfree(root); + ida_destroy(&root->cgroup_ida); + kfree(root); + } } static int cgroup_set_super(struct super_block *sb, void *data) @@ -1597,7 +1602,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); if (IS_ERR(sb)) { ret = PTR_ERR(sb); - cgroup_drop_root(opts.new_root); + cgroup_free_root(opts.new_root); goto drop_modules; } @@ -1641,6 +1646,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto unlock_drop; + ret = cgroup_init_root_id(root); + if (ret) + goto unlock_drop; + ret = rebind_subsystems(root, root->subsys_mask); if (ret == -EBUSY) { free_cg_links(&tmp_cg_links); @@ -1684,7 +1693,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * We re-used an existing hierarchy - the new root (if * any) is not needed */ - cgroup_drop_root(opts.new_root); + cgroup_free_root(opts.new_root); if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && root->flags != opts.flags) { @@ -1702,6 +1711,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, return dget(sb->s_root); unlock_drop: + cgroup_exit_root_id(root); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -1754,13 +1764,15 @@ static void cgroup_kill_sb(struct super_block *sb) { root_count--; } + cgroup_exit_root_id(root); + 
mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); simple_xattrs_free(&cgrp->xattrs); kill_litter_super(sb); - cgroup_drop_root(root); + cgroup_free_root(root); } static struct file_system_type cgroup_fs_type = { @@ -4642,7 +4654,9 @@ int __init cgroup_init(void) /* Add init_css_set to the hash table */ key = css_set_hash(init_css_set.subsys); hash_add(css_set_table, &init_css_set.hlist, key); - BUG_ON(!init_root_id(&rootnode)); + + /* allocate id for the dummy hierarchy */ + BUG_ON(cgroup_init_root_id(&rootnode)); cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); if (!cgroup_kobj) { -- cgit v1.2.3-70-g09d2 From 54e7b4eb15fc4354d5ada5469e3db4a220ddb3ed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Apr 2013 11:36:57 -0700 Subject: cgroup: drop hierarchy_id_lock Now that hierarchy_id alloc / free are protected by the cgroup mutexes, there's no need for this separate lock. Drop it. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dbc84f7d23b..3ef677d314b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -189,9 +189,13 @@ struct cgroup_event { static LIST_HEAD(roots); static int root_count; +/* + * Hierarchy ID allocation and mapping. It follows the same exclusion + * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for + * writes, either for reads. + */ static DEFINE_IDA(hierarchy_ida); static int next_hierarchy_id; -static DEFINE_SPINLOCK(hierarchy_id_lock); /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) @@ -1430,10 +1434,12 @@ static int cgroup_init_root_id(struct cgroupfs_root *root) { int ret; + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_root_mutex); + do { if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) return -ENOMEM; - spin_lock(&hierarchy_id_lock); /* Try to allocate the next unused ID */ ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, &root->hierarchy_id); @@ -1446,18 +1452,17 @@ static int cgroup_init_root_id(struct cgroupfs_root *root) /* Can only get here if the 31-bit IDR is full ... */ BUG_ON(ret); } - spin_unlock(&hierarchy_id_lock); } while (ret); return 0; } static void cgroup_exit_root_id(struct cgroupfs_root *root) { + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_root_mutex); + if (root->hierarchy_id) { - spin_lock(&hierarchy_id_lock); ida_remove(&hierarchy_ida, root->hierarchy_id); - spin_unlock(&hierarchy_id_lock); - root->hierarchy_id = 0; } } @@ -4656,8 +4661,14 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, key); /* allocate id for the dummy hierarchy */ + mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); + BUG_ON(cgroup_init_root_id(&rootnode)); + mutex_unlock(&cgroup_root_mutex); + mutex_unlock(&cgroup_mutex); + cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); if (!cgroup_kobj) { err = -ENOMEM; -- cgit v1.2.3-70-g09d2 From 1a574231669f8c3065c83974e9557fcbbd94b8a6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Apr 2013 11:36:58 -0700 Subject: cgroup: make hierarchy_id use cyclic idr We want to be able to lookup a hierarchy from its id and cyclic allocation is a whole lot simpler with idr. Convert to idr and use idr_alloc_cyclc(). 
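For reference, a self-contained sketch of the cyclic-idr pattern adopted here (the object, lock and function names below are made up; idr_alloc_cyclic(), idr_find() and idr_remove() are the real APIs):

    static DEFINE_IDR(example_idr);
    static DEFINE_MUTEX(example_mutex);     /* idr writers need external locking */

    static int example_alloc_id(void *object)
    {
            int id;

            mutex_lock(&example_mutex);
            /* cyclically hand out IDs >= 1; @end == 0 means "no upper limit" */
            id = idr_alloc_cyclic(&example_idr, object, 1, 0, GFP_KERNEL);
            mutex_unlock(&example_mutex);
            return id;                      /* new id, or -ENOMEM/-ENOSPC */
    }

    static void *example_lookup(int id)
    {
            return idr_find(&example_idr, id);      /* caller holds the same lock */
    }

    static void example_free_id(int id)
    {
            mutex_lock(&example_mutex);
            idr_remove(&example_idr, id);
            mutex_unlock(&example_mutex);
    }
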
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3ef677d314b..dcb417c6c24 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -194,8 +194,7 @@ static int root_count; * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for * writes, either for reads. */ -static DEFINE_IDA(hierarchy_ida); -static int next_hierarchy_id; +static DEFINE_IDR(cgroup_hierarchy_idr); /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) @@ -1432,27 +1431,16 @@ static void init_cgroup_root(struct cgroupfs_root *root) static int cgroup_init_root_id(struct cgroupfs_root *root) { - int ret; + int id; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_root_mutex); - do { - if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) - return -ENOMEM; - /* Try to allocate the next unused ID */ - ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, - &root->hierarchy_id); - if (ret == -ENOSPC) - /* Try again starting from 0 */ - ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); - if (!ret) { - next_hierarchy_id = root->hierarchy_id + 1; - } else if (ret != -EAGAIN) { - /* Can only get here if the 31-bit IDR is full ... */ - BUG_ON(ret); - } - } while (ret); + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 2, 0, GFP_KERNEL); + if (id < 0) + return id; + + root->hierarchy_id = id; return 0; } @@ -1462,7 +1450,7 @@ static void cgroup_exit_root_id(struct cgroupfs_root *root) lockdep_assert_held(&cgroup_root_mutex); if (root->hierarchy_id) { - ida_remove(&hierarchy_ida, root->hierarchy_id); + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); root->hierarchy_id = 0; } } -- cgit v1.2.3-70-g09d2 From 857a2beb09ab83e9a8185821ae16db7dfbe8b837 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Apr 2013 20:50:08 -0700 Subject: cgroup: implement task_cgroup_path_from_hierarchy() kdbus folks want a sane way to determine the cgroup path that a given task belongs to on a given hierarchy, which is a reasonble thing to expect from cgroup core. Implement task_cgroup_path_from_hierarchy(). v2: Dropped unnecessary NULL check on the return value of task_cgroup_from_root() as suggested by Li Zefan. 
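A hedged usage sketch for the new helper (the surrounding function and buffer size are invented; only task_cgroup_path_from_hierarchy() itself comes from this patch):

    static void example_report_cgroup(struct task_struct *task, int hierarchy_id)
    {
            char buf[256];
            int ret;

            /* grabs cgroup_mutex internally, so don't call this under controller locks */
            ret = task_cgroup_path_from_hierarchy(task, hierarchy_id, buf, sizeof(buf));
            if (ret)
                    pr_debug("no cgroup for pid %d on hierarchy %d (%d)\n",
                             task_pid_nr(task), hierarchy_id, ret);
            else
                    pr_debug("pid %d: %s\n", task_pid_nr(task), buf);
    }
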
Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Acked-by: Li Zefan Cc: Kay Sievers Cc: Lennart Poettering Cc: Daniel Mack --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5047355b9a0..383c630f36f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -542,6 +542,8 @@ int cgroup_is_removed(const struct cgroup *cgrp); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); +int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, + char *buf, size_t buflen); int cgroup_task_count(const struct cgroup *cgrp); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dcb417c6c24..6b2b1d945df 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1827,6 +1827,38 @@ out: } EXPORT_SYMBOL_GPL(cgroup_path); +/** + * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy + * @task: target task + * @hierarchy_id: the hierarchy to look up @task's cgroup from + * @buf: the buffer to write the path into + * @buflen: the length of the buffer + * + * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and + * copy its path into @buf. This function grabs cgroup_mutex and shouldn't + * be used inside locks used by cgroup controller callbacks. + */ +int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, + char *buf, size_t buflen) +{ + struct cgroupfs_root *root; + struct cgroup *cgrp = NULL; + int ret = -ENOENT; + + mutex_lock(&cgroup_mutex); + + root = idr_find(&cgroup_hierarchy_idr, hierarchy_id); + if (root) { + cgrp = task_cgroup_from_root(task, root); + ret = cgroup_path(cgrp, buf, buflen); + } + + mutex_unlock(&cgroup_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); + /* * Control Group taskset */ -- cgit v1.2.3-70-g09d2 From cb65537ee1134d3cc55c1fa83952bc8eb1212833 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 10 May 2013 19:50:26 +0100 Subject: Add wait_on_atomic_t() and wake_up_atomic_t() Add wait_on_atomic_t() and wake_up_atomic_t() to indicate became-zero events on atomic_t types. This uses the bit-wake waitqueue table. The key is set to a value outside of the number of bits in a long so that wait_on_bit() won't be woken up accidentally. What I'm using this for is: in a following patch I add a counter to struct fscache_cookie to count the number of outstanding operations that need access to netfs data. The way this works is: (1) When a cookie is allocated, the counter is initialised to 1. (2) When an operation wants to access netfs data, it calls atomic_inc_unless() to increment the counter before it does so. If it was 0, then the counter isn't incremented, the operation isn't permitted to access the netfs data (which might by this point no longer exist) and the operation aborts in some appropriate manner. (3) When an operation finishes with the netfs data, it decrements the counter and if it reaches 0, calls wake_up_atomic_t() on it - the assumption being that it was the last blocker. (4) When a cookie is released, the counter is decremented and the releaser uses wait_on_atomic_t() to wait for the counter to become 0 - which should indicate no one is using the netfs data any longer. The netfs data can then be destroyed. 
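Steps (1)-(4) as a compressed sketch (the cookie structure and helper names below are hypothetical; only wait_on_atomic_t() and wake_up_atomic_t() are the interfaces added here):

    struct my_cookie {
            atomic_t        n_active;       /* (1) initialised to 1 at allocation */
    };

    static int my_wait_action(atomic_t *p)
    {
            schedule();                     /* plain sleep; could be interruptible */
            return 0;
    }

    static bool my_access_begin(struct my_cookie *c)
    {
            /* (2) only proceed if the cookie is still live */
            return atomic_inc_not_zero(&c->n_active);
    }

    static void my_access_end(struct my_cookie *c)
    {
            /* (3) the last user wakes the releaser */
            if (atomic_dec_and_test(&c->n_active))
                    wake_up_atomic_t(&c->n_active);
    }

    static void my_cookie_release(struct my_cookie *c)
    {
            /* (4) drop the initial reference, then wait for the count to hit 0 */
            if (!atomic_dec_and_test(&c->n_active))
                    wait_on_atomic_t(&c->n_active, my_wait_action,
                                     TASK_UNINTERRUPTIBLE);
            kfree(c);
    }
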
There are some alternatives that I have thought of and that have been suggested by Tejun Heo: (A) Using wait_on_bit() to wait on a bit in the counter. This doesn't work because if that bit happens to be 0 then the wait won't happen - even if the counter is non-zero. (B) Using wait_on_bit() to wait on a flag elsewhere which is cleared when the counter reaches 0. Such a flag would be redundant and would add complexity. (C) Adding a waitqueue to fscache_cookie - this would expand that struct by several words for an event that happens just once in each cookie's lifetime. Further, cookies are generally per-file so there are likely to be a lot of them. (D) Similar to (C), but add a pointer to a waitqueue in the cookie instead of a waitqueue. This would add single word per cookie and so would be less of an expansion - but still an expansion. (E) Adding a static waitqueue to the fscache module. Generally this would be fine, but under certain circumstances many cookies will all get added at the same time (eg. NFS umount, cache withdrawal) thereby presenting scaling issues. Note that the wait may be significant as disk I/O may be in progress. So, I think reusing the wait_on_bit() waitqueue set is reasonable. I don't make much use of the waitqueue I need on a per-cookie basis, but sometimes I have a huge flood of the cookies to deal with. I also don't want to add a whole new set of global waitqueue tables specifically for the dec-to-0 event if I can reuse the bit tables. Signed-off-by: David Howells Tested-By: Milosz Tanski Acked-by: Jeff Layton --- include/linux/wait.h | 24 ++++++++++++++ kernel/wait.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index ac38be2692d..5bacfc4b336 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -23,6 +23,7 @@ struct __wait_queue { struct wait_bit_key { void *flags; int bit_nr; +#define WAIT_ATOMIC_T_BIT_NR -1 }; struct wait_bit_queue { @@ -60,6 +61,9 @@ struct task_struct; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } +#define __WAIT_ATOMIC_T_KEY_INITIALIZER(p) \ + { .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, } + extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *); #define init_waitqueue_head(q) \ @@ -146,8 +150,10 @@ void __wake_up_bit(wait_queue_head_t *, void *, int); int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); void wake_up_bit(void *, int); +void wake_up_atomic_t(atomic_t *); int out_of_line_wait_on_bit(void *, int, int (*)(void *), unsigned); int out_of_line_wait_on_bit_lock(void *, int, int (*)(void *), unsigned); +int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned); wait_queue_head_t *bit_waitqueue(void *, int); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) @@ -896,5 +902,23 @@ static inline int wait_on_bit_lock(void *word, int bit, return 0; return out_of_line_wait_on_bit_lock(word, bit, action, mode); } + +/** + * wait_on_atomic_t - Wait for an atomic_t to become 0 + * @val: The atomic value being waited on, a kernel virtual address + * @action: the function used to sleep, which may take special actions + * @mode: the task state to sleep in + * + * Wait for an atomic_t to become 0. 
We abuse the bit-wait waitqueue table for + * the purpose of getting a waitqueue, but we set the key to a bit number + * outside of the target 'word'. + */ +static inline +int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode) +{ + if (atomic_read(val) == 0) + return 0; + return out_of_line_wait_on_atomic_t(val, action, mode); +} #endif diff --git a/kernel/wait.c b/kernel/wait.c index 6698e0c04ea..ce0daa320a2 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -287,3 +287,91 @@ wait_queue_head_t *bit_waitqueue(void *word, int bit) return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; } EXPORT_SYMBOL(bit_waitqueue); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). + */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, + void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + atomic_t *val = key->flags; + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + atomic_read(val) != 0) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero + * return codes halt waiting and return. + */ +static __sched +int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(atomic_t *), unsigned mode) +{ + atomic_t *val; + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + val = q->key.flags; + if (atomic_read(val) == 0) + ret = (*action)(val); + } while (!ret && atomic_read(val) != 0); + finish_wait(wq, &q->wait); + return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p) \ + struct wait_bit_queue name = { \ + .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ + .wait = { \ + .private = current, \ + .func = wake_atomic_t_function, \ + .task_list = \ + LIST_HEAD_INIT((name).wait.task_list), \ + }, \ + } + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), + unsigned mode) +{ + wait_queue_head_t *wq = atomic_t_waitqueue(p); + DEFINE_WAIT_ATOMIC_T(wait, p); + + return __wait_on_atomic_t(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @word: The word being waited on, a kernel virtual address + * @bit: The bit of the word being waited on + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). 
+ */ +void wake_up_atomic_t(atomic_t *p) +{ + __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); -- cgit v1.2.3-70-g09d2 From 5d33b883aed81c6fbcd09c6f7c3619eee850a7e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:43 +0000 Subject: clocksource: Always verify highres capability If a clocksource has a (wrong) high rating, but can't be used as a timebase for oneshot tick mode, it is unconditionally selected even when the system is already in oneshot tick mode. This causes full system failure. Verify the clocksource selection against the oneshot mode. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.635040849@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c9583382141..dda5c7130d9 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -553,6 +553,26 @@ static u64 clocksource_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET +static struct clocksource *clocksource_find_best(bool oneshot) +{ + struct clocksource *cs; + + if (!finished_booting || list_empty(&clocksource_list)) + return NULL; + + /* + * We pick the clocksource with the highest rating. If oneshot + * mode is active, we pick the highres valid clocksource with + * the best rating. + */ + list_for_each_entry(cs, &clocksource_list, list) { + if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + continue; + return cs; + } + return NULL; +} + /** * clocksource_select - Select the best clocksource available * @@ -563,12 +583,14 @@ static u64 clocksource_max_deferment(struct clocksource *cs) */ static void clocksource_select(void) { + bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; - if (!finished_booting || list_empty(&clocksource_list)) + /* Find the best suitable clocksource */ + best = clocksource_find_best(oneshot); + if (!best) return; - /* First clocksource on the list has the best rating. */ - best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. */ list_for_each_entry(cs, &clocksource_list, list) { if (strcmp(cs->name, override_name) != 0) @@ -578,8 +600,7 @@ static void clocksource_select(void) * capable clocksource if the tick code is in oneshot * mode (highres or nohz) */ - if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && - tick_oneshot_mode_active()) { + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ printk(KERN_WARNING "Override clocksource %s is not " "HRT compatible. Cannot switch while in " -- cgit v1.2.3-70-g09d2 From ba919d1caa2e624eb8c6cae1f2ce0a253e697d45 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:44 +0000 Subject: clocksource: Let timekeeping_notify return success/error timekeeping_notify() can fail due cs->enable() failure. Though the caller does not notice and happily keeps the wrong clocksource as the current one. Let the caller know about failure, so the current clocksource will be shown correctly in sysfs. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.696321912@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 2 +- kernel/time/clocksource.c | 6 +++--- kernel/time/timekeeping.c | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 7279b94c01d..aa6ba44e75d 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -321,7 +321,7 @@ static inline void __clocksource_updatefreq_khz(struct clocksource *cs, u32 khz) } -extern void timekeeping_notify(struct clocksource *clock); +extern int timekeeping_notify(struct clocksource *clock); extern cycle_t clocksource_mmio_readl_up(struct clocksource *); extern cycle_t clocksource_mmio_readl_down(struct clocksource *); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index dda5c7130d9..1923a340bd9 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -611,10 +611,10 @@ static void clocksource_select(void) best = cs; break; } - if (curr_clocksource != best) { - printk(KERN_INFO "Switching to clocksource %s\n", best->name); + + if (curr_clocksource != best && !timekeeping_notify(best)) { + pr_info("Switched to clocksource %s\n", best->name); curr_clocksource = best; - timekeeping_notify(curr_clocksource); } } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 98cd470bbe4..da6e10c7a37 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -648,14 +648,15 @@ static int change_clocksource(void *data) * This function is called from clocksource.c after a new, better clock * source has been registered. The caller holds the clocksource_mutex. */ -void timekeeping_notify(struct clocksource *clock) +int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &timekeeper; if (tk->clock == clock) - return; + return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); + return tk->clock == clock ? 0 : -1; } /** -- cgit v1.2.3-70-g09d2 From 09ac369c825d9d593404306d59062d854b321e9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:44 +0000 Subject: clocksource: Add module refcount Add a module refcount, so the current clocksource cannot be removed unconditionally. 
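From the driver side this presumably just means filling in the new field; a sketch for a fictitious modular clocksource (only the .owner field is what this patch adds, everything else is a stock clocksource definition):

    static cycle_t my_clksrc_read(struct clocksource *cs)
    {
            return 0;       /* read the hardware counter here */
    }

    static struct clocksource my_clksrc = {
            .name   = "my-timer",
            .rating = 200,
            .read   = my_clksrc_read,
            .mask   = CLOCKSOURCE_MASK(32),
            .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
            .owner  = THIS_MODULE,  /* lets the core pin the module while in use */
    };

The device would then be registered as usual, e.g. with clocksource_register_hz(&my_clksrc, rate); built-in clocksources can leave .owner at NULL since try_module_get(NULL) succeeds.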
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.762417789@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 3 +++ kernel/time/timekeeping.c | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index aa6ba44e75d..32a895b114d 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -21,6 +21,7 @@ /* clocksource cycle base type */ typedef u64 cycle_t; struct clocksource; +struct module; #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA #include @@ -162,6 +163,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc, * @suspend: suspend function for the clocksource, if necessary * @resume: resume function for the clocksource, if necessary * @cycle_last: most recent cycle counter value seen by ::read() + * @owner: module reference, must be set by clocksource in modules */ struct clocksource { /* @@ -195,6 +197,7 @@ struct clocksource { cycle_t cs_last; cycle_t wd_last; #endif + struct module *owner; } ____cacheline_aligned; /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index da6e10c7a37..933efa4071c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -627,11 +627,20 @@ static int change_clocksource(void *data) write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); - if (!new->enable || new->enable(new) == 0) { - old = tk->clock; - tk_setup_internals(tk, new); - if (old->disable) - old->disable(old); + /* + * If the cs is in module, get a module reference. Succeeds + * for built-in code (owner == NULL) as well. + */ + if (try_module_get(new->owner)) { + if (!new->enable || new->enable(new) == 0) { + old = tk->clock; + tk_setup_internals(tk, new); + if (old->disable) + old->disable(old); + module_put(old->owner); + } else { + module_put(new->owner); + } } timekeeping_update(tk, true, true); -- cgit v1.2.3-70-g09d2 From f5a2e34375a5e2b711aea488ac3ae50eeba6d57c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:45 +0000 Subject: clocksource: Allow clocksource select to skip current clocksource Preparatory patch for clocksource unbind support. Split out code from clocksource_select and modify it, so it skips the current clocksource on request and tries to find a fallback clocksource. Convert all existing users. No functional change. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.834965397@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1923a340bd9..9782997cb6c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -553,7 +553,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET -static struct clocksource *clocksource_find_best(bool oneshot) +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) { struct clocksource *cs; @@ -566,6 +566,8 @@ static struct clocksource *clocksource_find_best(bool oneshot) * the best rating. 
*/ list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) continue; return cs; @@ -573,26 +575,20 @@ static struct clocksource *clocksource_find_best(bool oneshot) return NULL; } -/** - * clocksource_select - Select the best clocksource available - * - * Private function. Must hold clocksource_mutex when called. - * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. - */ -static void clocksource_select(void) +static void __clocksource_select(bool skipcur) { bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; /* Find the best suitable clocksource */ - best = clocksource_find_best(oneshot); + best = clocksource_find_best(oneshot, skipcur); if (!best) return; /* Check for the override clocksource. */ list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; if (strcmp(cs->name, override_name) != 0) continue; /* @@ -618,6 +614,19 @@ static void clocksource_select(void) } } +/** + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. + */ +static void clocksource_select(void) +{ + return __clocksource_select(false); +} + #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } -- cgit v1.2.3-70-g09d2 From 29b5407819f59731c9423238fae03b756822708c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:45 +0000 Subject: clocksource: Split out user string input Split out the user string input for clocksource override. Preparatory patch for unbind. [ jstultz: Fix an off by one error ] Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.895851338@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 9782997cb6c..d7f1a45c2fa 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -174,7 +174,8 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); -static char override_name[32]; +#define CS_NAME_LEN 32 +static char override_name[CS_NAME_LEN]; static int finished_booting; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG @@ -838,6 +839,23 @@ sysfs_show_current_clocksources(struct device *dev, return count; } +static size_t clocksource_get_uname(const char *buf, char *dst, size_t cnt) +{ + size_t ret = cnt; + + /* strings from sysfs write are not 0 terminated! */ + if (!cnt || cnt >= CS_NAME_LEN) + return -EINVAL; + + /* strip of \n: */ + if (buf[cnt-1] == '\n') + cnt--; + if (cnt > 0) + memcpy(dst, buf, cnt); + dst[cnt] = 0; + return ret; +} + /** * sysfs_override_clocksource - interface for manually overriding clocksource * @dev: unused @@ -852,22 +870,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - size_t ret = count; - - /* strings from sysfs write are not 0 terminated! 
*/ - if (count >= sizeof(override_name)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; + size_t ret; mutex_lock(&clocksource_mutex); - if (count > 0) - memcpy(override_name, buf, count); - override_name[count] = 0; - clocksource_select(); + ret = clocksource_get_uname(buf, override_name, count); + if (ret >= 0) + clocksource_select(); mutex_unlock(&clocksource_mutex); -- cgit v1.2.3-70-g09d2 From 7eaeb34305dee26634f7c98ae62646da5cebe91d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:46 +0000 Subject: clocksource: Provide unbind interface in sysfs With the module refcount held for the current clocksource there is no way to unload the module. Provide a sysfs interface which allows to unbind the clocksource. One could argue that the clocksource override could be (ab)used to do so, but the clocksource override cannot be used from the kernel itself, while an unbind function can be used to programmatically check whether a clocksource can be shutdown or not. The unbind functionality uses the new skip current feature of clocksource_select and verifies that a fallback clocksource has been installed. If the clocksource which should be unbound is the current clocksource and no fallback can be found, unbind returns -EBUSY. This does not support the unbinding of a clocksource which is used as the watchdog clocksource. No point in fostering crappy hardware. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.964218245@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d7f1a45c2fa..791d1aeb17a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -440,6 +440,11 @@ static int clocksource_watchdog_kthread(void *data) return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) +{ + return cs == watchdog; +} + #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ static void clocksource_enqueue_watchdog(struct clocksource *cs) @@ -451,6 +456,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int clocksource_watchdog_kthread(void *data) { return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -628,6 +634,11 @@ static void clocksource_select(void) return __clocksource_select(false); } +static void clocksource_select_fallback(void) +{ + return __clocksource_select(true); +} + #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } @@ -803,6 +814,29 @@ void clocksource_change_rating(struct clocksource *cs, int rating) } EXPORT_SYMBOL(clocksource_change_rating); +/* + * Unbind clocksource @cs. Called with clocksource_mutex held + */ +static int clocksource_unbind(struct clocksource *cs) +{ + /* + * I really can't convince myself to support this on hardware + * designed by lobotomized monkeys. 
+ */ + if (clocksource_is_watchdog(cs)) + return -EBUSY; + + if (cs == curr_clocksource) { + /* Select and try to install a replacement clock source */ + clocksource_select_fallback(); + if (curr_clocksource == cs) + return -EBUSY; + } + clocksource_dequeue_watchdog(cs); + list_del_init(&cs->list); + return 0; +} + /** * clocksource_unregister - remove a registered clocksource * @cs: clocksource to be unregistered @@ -883,6 +917,40 @@ static ssize_t sysfs_override_clocksource(struct device *dev, return ret; } +/** + * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * @dev: unused + * @attr: unused + * @buf: unused + * @count: length of buffer + * + * Takes input from sysfs interface for manually unbinding a clocksource. + */ +static ssize_t sysfs_unbind_clocksource(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct clocksource *cs; + char name[CS_NAME_LEN]; + size_t ret; + + ret = clocksource_get_uname(buf, name, count); + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clocksource_mutex); + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, name)) + continue; + ret = clocksource_unbind(cs); + break; + } + mutex_unlock(&clocksource_mutex); + + return ret ? ret : count; +} + /** * sysfs_show_available_clocksources - sysfs interface for listing clocksource * @dev: unused @@ -925,6 +993,8 @@ sysfs_show_available_clocksources(struct device *dev, static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, sysfs_override_clocksource); +static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); + static DEVICE_ATTR(available_clocksource, 0444, sysfs_show_available_clocksources, NULL); @@ -948,6 +1018,9 @@ static int __init init_clocksource_sysfs(void) error = device_create_file( &device_clocksource, &dev_attr_current_clocksource); + if (!error) + error = device_create_file(&device_clocksource, + &dev_attr_unbind_clocksource); if (!error) error = device_create_file( &device_clocksource, -- cgit v1.2.3-70-g09d2 From a89c7edbe7d7aa80f507915f3dd801211b116b79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:46 +0000 Subject: clocksource: Let clocksource_unregister() return success/error The unregister call can fail, if the clocksource is the current one and there is no replacement clocksource available. It can also fail, if the clocksource is the watchdog clocksource and I'm not going to provide support for this. 
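A sketch of what this means for a modular driver's teardown path (driver and device names are invented; the point is that the return value now has to be checked):

    static struct clocksource my_clksrc;    /* registered earlier by the driver */

    static void my_driver_teardown(void)
    {
            int ret;

            ret = clocksource_unregister(&my_clksrc);
            if (ret)
                    /* still the current clocksource with no fallback, or the watchdog */
                    pr_warn("my-timer: cannot unregister clocksource (%d)\n", ret);
    }
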
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.029915527@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 2 +- kernel/time/clocksource.c | 33 ++++++++++++--------------------- 2 files changed, 13 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 32a895b114d..2f39a491166 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -282,7 +282,7 @@ static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift) extern int clocksource_register(struct clocksource*); -extern void clocksource_unregister(struct clocksource*); +extern int clocksource_unregister(struct clocksource*); extern void clocksource_touch_watchdog(void); extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 791d1aeb17a..31b90332f47 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -389,28 +389,17 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_dequeue_watchdog(struct clocksource *cs) { - struct clocksource *tmp; unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); - if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - /* cs is a watched clocksource. */ - list_del_init(&cs->wd_list); - } else if (cs == watchdog) { - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - /* Current watchdog is removed. Find an alternative. */ - watchdog = NULL; - list_for_each_entry(tmp, &clocksource_list, list) { - if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) - continue; - if (!watchdog || tmp->rating > watchdog->rating) - watchdog = tmp; + if (cs != watchdog) { + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); } } - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; - /* Check if the watchdog timer needs to be stopped. */ - clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } @@ -841,13 +830,15 @@ static int clocksource_unbind(struct clocksource *cs) * clocksource_unregister - remove a registered clocksource * @cs: clocksource to be unregistered */ -void clocksource_unregister(struct clocksource *cs) +int clocksource_unregister(struct clocksource *cs) { + int ret = 0; + mutex_lock(&clocksource_mutex); - clocksource_dequeue_watchdog(cs); - list_del(&cs->list); - clocksource_select(); + if (!list_empty(&cs->list)) + ret = clocksource_unbind(cs); mutex_unlock(&clocksource_mutex); + return ret; } EXPORT_SYMBOL(clocksource_unregister); -- cgit v1.2.3-70-g09d2 From 7172a286ced0c1f4f239a0fa09db54ed37d3ead2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:47 +0000 Subject: clockevents: Get rid of the notifier chain 7+ years and still a single user. Kill it. 
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.098520211@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 1 - kernel/time/clockevents.c | 35 +++-------------------------------- kernel/time/tick-broadcast.c | 5 ++--- kernel/time/tick-common.c | 30 +++++------------------------- kernel/time/tick-internal.h | 7 ++++--- 5 files changed, 14 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 963d7143138..2f498f66c1b 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -150,7 +150,6 @@ extern void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new); extern void clockevents_set_mode(struct clock_event_device *dev, enum clock_event_mode mode); -extern int clockevents_register_notifier(struct notifier_block *nb); extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index c6d6400ee13..dd70b4842c6 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include "tick-internal.h" @@ -23,10 +22,6 @@ /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); - -/* Notification for clock events */ -static RAW_NOTIFIER_HEAD(clockevents_chain); - /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); @@ -232,30 +227,6 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, return (rc && force) ? clockevents_program_min_delta(dev) : rc; } -/** - * clockevents_register_notifier - register a clock events change listener - */ -int clockevents_register_notifier(struct notifier_block *nb) -{ - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&clockevents_lock, flags); - ret = raw_notifier_chain_register(&clockevents_chain, nb); - raw_spin_unlock_irqrestore(&clockevents_lock, flags); - - return ret; -} - -/* - * Notify about a clock event change. Called with clockevents_lock - * held. - */ -static void clockevents_do_notify(unsigned long reason, void *dev) -{ - raw_notifier_call_chain(&clockevents_chain, reason, dev); -} - /* * Called after a notify add to make devices available which were * released from the notifier call. 
@@ -269,7 +240,7 @@ static void clockevents_notify_released(void) struct clock_event_device, list); list_del(&dev->list); list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + tick_check_new_device(dev); } } @@ -290,7 +261,7 @@ void clockevents_register_device(struct clock_event_device *dev) raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + tick_check_new_device(dev); clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); @@ -433,7 +404,7 @@ void clockevents_notify(unsigned long reason, void *arg) int cpu; raw_spin_lock_irqsave(&clockevents_lock, flags); - clockevents_do_notify(reason, arg); + tick_notify(reason, arg); switch (reason) { case CLOCK_EVT_NOTIFY_CPU_DEAD: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 24938d57766..3500caaa0bf 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -64,7 +64,7 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) /* * Check, if the device can be utilized as broadcast device: */ -int tick_check_broadcast_device(struct clock_event_device *dev) +void tick_install_broadcast_device(struct clock_event_device *dev) { struct clock_event_device *cur = tick_broadcast_device.evtdev; @@ -72,7 +72,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; + return; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); if (cur) @@ -90,7 +90,6 @@ int tick_check_broadcast_device(struct clock_event_device *dev) */ if (dev->features & CLOCK_EVT_FEAT_ONESHOT) tick_clock_notify(); - return 1; } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 5d3fb100bc0..dbf4e18d510 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -208,11 +208,11 @@ static void tick_setup_device(struct tick_device *td, /* * Check, if the new registered device should be used. */ -static int tick_check_new_device(struct clock_event_device *newdev) +void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; - int cpu, ret = NOTIFY_OK; + int cpu; unsigned long flags; raw_spin_lock_irqsave(&tick_device_lock, flags); @@ -275,18 +275,14 @@ static int tick_check_new_device(struct clock_event_device *newdev) tick_oneshot_notify(); raw_spin_unlock_irqrestore(&tick_device_lock, flags); - return NOTIFY_STOP; + return; out_bc: /* * Can the new device be used as a broadcast device ? 
*/ - if (tick_check_broadcast_device(newdev)) - ret = NOTIFY_STOP; - + tick_install_broadcast_device(newdev); raw_spin_unlock_irqrestore(&tick_device_lock, flags); - - return ret; } /* @@ -360,17 +356,10 @@ static void tick_resume(void) raw_spin_unlock_irqrestore(&tick_device_lock, flags); } -/* - * Notification about clock event devices - */ -static int tick_notify(struct notifier_block *nb, unsigned long reason, - void *dev) +void tick_notify(unsigned long reason, void *dev) { switch (reason) { - case CLOCK_EVT_NOTIFY_ADD: - return tick_check_new_device(dev); - case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_OFF: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: @@ -404,21 +393,12 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, default: break; } - - return NOTIFY_OK; } -static struct notifier_block tick_notifier = { - .notifier_call = tick_notify, -}; - /** * tick_init - initialize the tick control - * - * Register the notifier with the clockevents framework */ void __init tick_init(void) { - clockevents_register_notifier(&tick_notifier); tick_broadcast_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f0299eae460..60742fe6f63 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -18,6 +18,8 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void tick_notify(unsigned long reason, void *dev); +extern void tick_check_new_device(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); @@ -90,7 +92,7 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; } */ #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); -extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); extern void tick_shutdown_broadcast(unsigned int *cpup); @@ -102,9 +104,8 @@ tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); #else /* !BROADCAST */ -static inline int tick_check_broadcast_device(struct clock_event_device *dev) +static inline void tick_install_broadcast_device(struct clock_event_device *dev) { - return 0; } static inline int tick_is_broadcast_device(struct clock_event_device *dev) -- cgit v1.2.3-70-g09d2 From 7126cac426137633e470167524e7bcb590fd49b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:48 +0000 Subject: clockevents: Simplify locking Now that the notifier chain is gone there are no other users and it's pointless to nest tick_device_lock inside of clockevents_lock because there is no other use case. 
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.162888472@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index dbf4e18d510..170a4bdfa99 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,7 +33,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); ktime_t tick_next_period; ktime_t tick_period; int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -static DEFINE_RAW_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c @@ -206,16 +205,14 @@ static void tick_setup_device(struct tick_device *td, } /* - * Check, if the new registered device should be used. + * Check, if the new registered device should be used. Called with + * clockevents_lock held and interrupts disabled. */ void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; int cpu; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, newdev->cpumask)) @@ -273,8 +270,6 @@ void tick_check_new_device(struct clock_event_device *newdev) tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); - - raw_spin_unlock_irqrestore(&tick_device_lock, flags); return; out_bc: @@ -282,7 +277,6 @@ out_bc: * Can the new device be used as a broadcast device ? */ tick_install_broadcast_device(newdev); - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } /* @@ -311,9 +305,7 @@ static void tick_shutdown(unsigned int *cpup) { struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); struct clock_event_device *dev = td->evtdev; - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); td->mode = TICKDEV_MODE_PERIODIC; if (dev) { /* @@ -325,26 +317,20 @@ static void tick_shutdown(unsigned int *cpup) dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_suspend(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_shutdown(td->evtdev); - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; int broadcast = tick_resume_broadcast(); - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); if (!broadcast) { @@ -353,9 +339,11 @@ static void tick_resume(void) else tick_resume_oneshot(); } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } +/* + * Called with clockevents_lock held and interrupts disabled + */ void tick_notify(unsigned long reason, void *dev) { switch (reason) { -- cgit v1.2.3-70-g09d2 From 8c53daf63f56791ed47fc585206ef3049489612f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:48 +0000 Subject: clockevents: Move the tick_notify() switch case to clockevents_notify() No need to call another function and have duplicated cases. 
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.235746557@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 28 ++++++++++++++++++++++++- kernel/time/tick-common.c | 50 ++++----------------------------------------- kernel/time/tick-internal.h | 5 ++++- 3 files changed, 35 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index dd70b4842c6..0e3a8448e11 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -404,10 +404,36 @@ void clockevents_notify(unsigned long reason, void *arg) int cpu; raw_spin_lock_irqsave(&clockevents_lock, flags); - tick_notify(reason, arg); switch (reason) { + case CLOCK_EVT_NOTIFY_BROADCAST_ON: + case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + tick_broadcast_on_off(reason, arg); + break; + + case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: + case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: + tick_broadcast_oneshot_control(reason); + break; + + case CLOCK_EVT_NOTIFY_CPU_DYING: + tick_handover_do_timer(arg); + break; + + case CLOCK_EVT_NOTIFY_SUSPEND: + tick_suspend(); + tick_suspend_broadcast(); + break; + + case CLOCK_EVT_NOTIFY_RESUME: + tick_resume(); + break; + case CLOCK_EVT_NOTIFY_CPU_DEAD: + tick_shutdown_broadcast_oneshot(arg); + tick_shutdown_broadcast(arg); + tick_shutdown(arg); /* * Unregister the clock event devices which were * released from the users in the notify chain. diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 170a4bdfa99..84c7cfca4d7 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -284,7 +284,7 @@ out_bc: * * Called with interrupts disabled. */ -static void tick_handover_do_timer(int *cpup) +void tick_handover_do_timer(int *cpup) { if (*cpup == tick_do_timer_cpu) { int cpu = cpumask_first(cpu_online_mask); @@ -301,7 +301,7 @@ static void tick_handover_do_timer(int *cpup) * access the hardware device itself. * We just set the mode and remove it from the lists. 
*/ -static void tick_shutdown(unsigned int *cpup) +void tick_shutdown(unsigned int *cpup) { struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); struct clock_event_device *dev = td->evtdev; @@ -319,14 +319,14 @@ static void tick_shutdown(unsigned int *cpup) } } -static void tick_suspend(void) +void tick_suspend(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); clockevents_shutdown(td->evtdev); } -static void tick_resume(void) +void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); int broadcast = tick_resume_broadcast(); @@ -341,48 +341,6 @@ static void tick_resume(void) } } -/* - * Called with clockevents_lock held and interrupts disabled - */ -void tick_notify(unsigned long reason, void *dev) -{ - switch (reason) { - - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - tick_broadcast_on_off(reason, dev); - break; - - case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: - case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); - break; - - case CLOCK_EVT_NOTIFY_CPU_DYING: - tick_handover_do_timer(dev); - break; - - case CLOCK_EVT_NOTIFY_CPU_DEAD: - tick_shutdown_broadcast_oneshot(dev); - tick_shutdown_broadcast(dev); - tick_shutdown(dev); - break; - - case CLOCK_EVT_NOTIFY_SUSPEND: - tick_suspend(); - tick_suspend_broadcast(); - break; - - case CLOCK_EVT_NOTIFY_RESUME: - tick_resume(); - break; - - default: - break; - } -} - /** * tick_init - initialize the tick control */ diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 60742fe6f63..06bfc8802df 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -18,8 +18,11 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); -extern void tick_notify(unsigned long reason, void *dev); extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_handover_do_timer(int *cpup); +extern void tick_shutdown(unsigned int *cpup); +extern void tick_suspend(void); +extern void tick_resume(void); extern void clockevents_shutdown(struct clock_event_device *dev); -- cgit v1.2.3-70-g09d2 From ccf33d6880f39a35158fff66db13000ae4943fac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:49 +0000 Subject: clockevents: Add module refcount We want to be able to remove clockevent modules as well. Add a refcount so we don't remove a module with an active clock event device. 
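The rule the patch introduces -- pin the module that provides a clock event device while the device is installed, drop the reference when the device is exchanged away -- can be sketched roughly as below. This is only an illustration: try_module_get()/module_put() and the new ->owner field come from the patch, while the helper name and the -ENODEV return are assumptions.

#include <linux/module.h>
#include <linux/clockchips.h>

/* Install newdev as the active device, keeping its providing module
 * pinned for as long as the device stays in use. */
static int install_device(struct clock_event_device *newdev,
			  struct clock_event_device **cur)
{
	/* Refuse a device whose module is already on its way out. */
	if (!try_module_get(newdev->owner))
		return -ENODEV;

	if (*cur)
		module_put((*cur)->owner);	/* release the old provider */
	*cur = newdev;
	return 0;
}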
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.307435149@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 3 +++ kernel/time/clockevents.c | 1 + kernel/time/tick-broadcast.c | 3 +++ kernel/time/tick-common.c | 4 ++++ 4 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 2f498f66c1b..ae1193bcf07 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -30,6 +30,7 @@ enum clock_event_nofitiers { #include struct clock_event_device; +struct module; /* Clock event mode commands */ enum clock_event_mode { @@ -83,6 +84,7 @@ enum clock_event_mode { * @irq: IRQ number (only for non CPU local devices) * @cpumask: cpumask to indicate for which CPUs this device works * @list: list head for the management code + * @owner: module reference */ struct clock_event_device { void (*event_handler)(struct clock_event_device *); @@ -112,6 +114,7 @@ struct clock_event_device { int irq; const struct cpumask *cpumask; struct list_head list; + struct module *owner; } ____cacheline_aligned; /* diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0e3a8448e11..89e394caa76 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -357,6 +357,7 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. */ if (old) { + module_put(old->owner); clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 3500caaa0bf..0e374cd2e0e 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "tick-internal.h" @@ -73,6 +74,8 @@ void tick_install_broadcast_device(struct clock_event_device *dev) tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) return; + if (!try_module_get(dev->owner)) + return; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); if (cur) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 84c7cfca4d7..433a1e11d13 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -257,6 +258,9 @@ void tick_check_new_device(struct clock_event_device *newdev) goto out_bc; } + if (!try_module_get(newdev->owner)) + return; + /* * Replace the eventually existing device by the new * device. If the current device is the broadcast device, do -- cgit v1.2.3-70-g09d2 From 501f867064e95f9a6f540e60705be0937280e7ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:49 +0000 Subject: clockevents: Provide sysfs interface Provide a simple sysfs interface for the clockevent devices. Show the current active clockevent device. 
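A small userspace sketch of reading the new attribute. The path is an assumption derived from the subsystem and device names registered below ("clockevents"/"clockevent"); the log itself does not spell it out.

#include <stdio.h>

int main(void)
{
	char name[64];
	/* Assumed per-cpu path; the broadcast device would be expected
	 * under .../clockevents/broadcast/current_device. */
	FILE *f = fopen("/sys/devices/system/clockevents/clockevent0/current_device", "r");

	if (!f)
		return 1;
	if (fgets(name, sizeof(name), f))
		printf("tick device on cpu0: %s", name);
	fclose(f);
	return 0;
}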
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.371634778@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 89e394caa76..0a23f4f2993 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "tick-internal.h" @@ -460,4 +461,89 @@ void clockevents_notify(unsigned long reason, void *arg) raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_notify); + +#ifdef CONFIG_SYSFS +struct bus_type clockevents_subsys = { + .name = "clockevents", + .dev_name = "clockevent", +}; + +static DEFINE_PER_CPU(struct device, tick_percpu_dev); +static struct tick_device *tick_get_tick_dev(struct device *dev); + +static ssize_t sysfs_show_current_tick_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct tick_device *td; + ssize_t count = 0; + + raw_spin_lock_irq(&clockevents_lock); + td = tick_get_tick_dev(dev); + if (td && td->evtdev) + count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); + raw_spin_unlock_irq(&clockevents_lock); + return count; +} +static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static struct device tick_bc_dev = { + .init_name = "broadcast", + .id = 0, + .bus = &clockevents_subsys, +}; + +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return dev == &tick_bc_dev ? tick_get_broadcast_device() : + &per_cpu(tick_cpu_device, dev->id); +} + +static __init int tick_broadcast_init_sysfs(void) +{ + int err = device_register(&tick_bc_dev); + + if (!err) + err = device_create_file(&tick_bc_dev, &dev_attr_current_device); + return err; +} +#else +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return &per_cpu(tick_cpu_device, dev->id); +} +static inline int tick_broadcast_init_sysfs(void) { return 0; } #endif + +static int __init tick_init_sysfs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device *dev = &per_cpu(tick_percpu_dev, cpu); + int err; + + dev->id = cpu; + dev->bus = &clockevents_subsys; + err = device_register(dev); + if (!err) + err = device_create_file(dev, &dev_attr_current_device); + if (err) + return err; + } + return tick_broadcast_init_sysfs(); +} + +static int __init clockevents_init_sysfs(void) +{ + int err = subsys_system_register(&clockevents_subsys, NULL); + + if (!err) + err = tick_init_sysfs(); + return err; +} +device_initcall(clockevents_init_sysfs); +#endif /* SYSFS */ + +#endif /* GENERIC_CLOCK_EVENTS */ -- cgit v1.2.3-70-g09d2 From 45cb8e01b2ecef1c2afb18333e95793fa1a90281 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:50 +0000 Subject: clockevents: Split out selection logic Split out the clockevent device selection logic. Preparatory patch to allow unbinding active clockevent devices. 
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.431796247@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 25 ++++++++++++---- kernel/time/tick-common.c | 69 +++++++++++++++++++++++--------------------- 2 files changed, 56 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0e374cd2e0e..d067c01586f 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -65,19 +65,34 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) /* * Check, if the device can be utilized as broadcast device: */ +static bool tick_check_broadcast_device(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_C3STOP)) + return false; + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + + return !curdev || newdev->rating > curdev->rating; +} + +/* + * Conditionally install/replace broadcast device + */ void tick_install_broadcast_device(struct clock_event_device *dev) { struct clock_event_device *cur = tick_broadcast_device.evtdev; - if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || - (tick_broadcast_device.evtdev && - tick_broadcast_device.evtdev->rating >= dev->rating) || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) + if (!tick_check_broadcast_device(cur, dev)) return; + if (!try_module_get(dev->owner)) return; - clockevents_exchange_device(tick_broadcast_device.evtdev, dev); + clockevents_exchange_device(cur, dev); if (cur) cur->event_handler = clockevents_handle_noop; tick_broadcast_device.evtdev = dev; diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 433a1e11d13..c3402165034 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -205,6 +205,37 @@ static void tick_setup_device(struct tick_device *td, tick_setup_oneshot(newdev, handler, next_event); } +static bool tick_check_percpu(struct clock_event_device *curdev, + struct clock_event_device *newdev, int cpu) +{ + if (!cpumask_test_cpu(cpu, newdev->cpumask)) + return false; + if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) + return true; + /* Check if irq affinity can be set */ + if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) + return false; + /* Prefer an existing cpu local device */ + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) + return false; + return true; +} + +static bool tick_check_preferred(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + /* Prefer oneshot capable device */ + if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { + if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + if (tick_oneshot_mode_active()) + return false; + } + + /* Use the higher rated one */ + return !curdev || newdev->rating > curdev->rating; +} + /* * Check, if the new registered device should be used. Called with * clockevents_lock held and interrupts disabled. @@ -223,40 +254,12 @@ void tick_check_new_device(struct clock_event_device *newdev) curdev = td->evtdev; /* cpu local device ? */ - if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { - - /* - * If the cpu affinity of the device interrupt can not - * be set, ignore it. 
- */ - if (!irq_can_set_affinity(newdev->irq)) - goto out_bc; - - /* - * If we have a cpu local device already, do not replace it - * by a non cpu local device - */ - if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) - goto out_bc; - } + if (!tick_check_percpu(curdev, newdev, cpu)) + goto out_bc; - /* - * If we have an active device, then check the rating and the oneshot - * feature. - */ - if (curdev) { - /* - * Prefer one shot capable devices ! - */ - if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && - !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) - goto out_bc; - /* - * Check the rating - */ - if (curdev->rating >= newdev->rating) - goto out_bc; - } + /* Preference decision */ + if (!tick_check_preferred(curdev, newdev)) + goto out_bc; if (!try_module_get(newdev->owner)) return; -- cgit v1.2.3-70-g09d2 From 03e13cf5ee60584fe0c831682c67212effb7fca4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:50 +0000 Subject: clockevents: Implement unbind functionality Provide a sysfs interface to allow unbinding of clockevent devices. The device is unbound if it is unused or if there is a replacement device available. Unbinding of broadcast devices is not supported as we don't want to foster that nonsense. If no replacement device is available the unbind returns -EBUSY. Unbind is available from the kernel and through sysfs, which is necessary to drop the module refcount. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.499216659@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 1 + kernel/time/clockevents.c | 125 ++++++++++++++++++++++++++++++++++++++++++++ kernel/time/clocksource.c | 9 ++-- kernel/time/tick-common.c | 24 +++++++++ kernel/time/tick-internal.h | 7 +++ 5 files changed, 162 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index ae1193bcf07..0857922e8ad 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -141,6 +141,7 @@ static inline unsigned long div_sc(unsigned long ticks, unsigned long nsec, extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt); extern void clockevents_register_device(struct clock_event_device *dev); +extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu); extern void clockevents_config(struct clock_event_device *dev, u32 freq); extern void clockevents_config_and_register(struct clock_event_device *dev, diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0a23f4f2993..38959c86678 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -25,6 +25,13 @@ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); +/* Protection for unbind operations */ +static DEFINE_MUTEX(clockevents_mutex); + +struct ce_unbind { + struct clock_event_device *ce; + int res; +}; /** * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds @@ -245,6 +252,90 @@ static void clockevents_notify_released(void) } } +/* + * Try to install a replacement clock event device + */ +static int clockevents_replace(struct clock_event_device *ced) +{ + struct clock_event_device *dev, *newdev = NULL; + + list_for_each_entry(dev, &clockevent_devices, list) { + if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) + continue; + + if (!tick_check_replacement(newdev, dev)) + continue; 
+ + if (!try_module_get(dev->owner)) + continue; + + if (newdev) + module_put(newdev->owner); + newdev = dev; + } + if (newdev) { + tick_install_replacement(newdev); + list_del_init(&ced->list); + } + return newdev ? 0 : -EBUSY; +} + +/* + * Called with clockevents_mutex and clockevents_lock held + */ +static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) +{ + /* Fast track. Device is unused */ + if (ced->mode == CLOCK_EVT_MODE_UNUSED) { + list_del_init(&ced->list); + return 0; + } + + return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; +} + +/* + * SMP function call to unbind a device + */ +static void __clockevents_unbind(void *arg) +{ + struct ce_unbind *cu = arg; + int res; + + raw_spin_lock(&clockevents_lock); + res = __clockevents_try_unbind(cu->ce, smp_processor_id()); + if (res == -EAGAIN) + res = clockevents_replace(cu->ce); + cu->res = res; + raw_spin_unlock(&clockevents_lock); +} + +/* + * Issues smp function call to unbind a per cpu device. Called with + * clockevents_mutex held. + */ +static int clockevents_unbind(struct clock_event_device *ced, int cpu) +{ + struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; + + smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); + return cu.res; +} + +/* + * Unbind a clockevents device. + */ +int clockevents_unbind_device(struct clock_event_device *ced, int cpu) +{ + int ret; + + mutex_lock(&clockevents_mutex); + ret = clockevents_unbind(ced, cpu); + mutex_unlock(&clockevents_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(clockevents_unbind); + /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -487,6 +578,38 @@ static ssize_t sysfs_show_current_tick_dev(struct device *dev, } static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); +/* We don't support the abomination of removable broadcast devices */ +static ssize_t sysfs_unbind_tick_dev(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char name[CS_NAME_LEN]; + size_t ret = sysfs_get_uname(buf, name, count); + struct clock_event_device *ce; + + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clockevents_mutex); + raw_spin_lock_irq(&clockevents_lock); + list_for_each_entry(ce, &clockevent_devices, list) { + if (!strcmp(ce->name, name)) { + ret = __clockevents_try_unbind(ce, dev->id); + break; + } + } + raw_spin_unlock_irq(&clockevents_lock); + /* + * We hold clockevents_mutex, so ce can't go away + */ + if (ret == -EAGAIN) + ret = clockevents_unbind(ce, dev->id); + mutex_unlock(&clockevents_mutex); + return ret ? 
ret : count; +} +static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev); + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static struct device tick_bc_dev = { .init_name = "broadcast", @@ -529,6 +652,8 @@ static int __init tick_init_sysfs(void) err = device_register(dev); if (!err) err = device_create_file(dev, &dev_attr_current_device); + if (!err) + err = device_create_file(dev, &dev_attr_unbind_device); if (err) return err; } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 31b90332f47..6d05b00410c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -31,6 +31,8 @@ #include #include +#include "tick-internal.h" + void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, u64 start_tstamp) @@ -174,7 +176,6 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); -#define CS_NAME_LEN 32 static char override_name[CS_NAME_LEN]; static int finished_booting; @@ -864,7 +865,7 @@ sysfs_show_current_clocksources(struct device *dev, return count; } -static size_t clocksource_get_uname(const char *buf, char *dst, size_t cnt) +size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) { size_t ret = cnt; @@ -899,7 +900,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, mutex_lock(&clocksource_mutex); - ret = clocksource_get_uname(buf, override_name, count); + ret = sysfs_get_uname(buf, override_name, count); if (ret >= 0) clocksource_select(); @@ -925,7 +926,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, char name[CS_NAME_LEN]; size_t ret; - ret = clocksource_get_uname(buf, name, count); + ret = sysfs_get_uname(buf, name, count); if (ret < 0) return ret; diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index c3402165034..5edfb480603 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -205,6 +205,17 @@ static void tick_setup_device(struct tick_device *td, tick_setup_oneshot(newdev, handler, next_event); } +void tick_install_replacement(struct clock_event_device *newdev) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + int cpu = smp_processor_id(); + + clockevents_exchange_device(td->evtdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); +} + static bool tick_check_percpu(struct clock_event_device *curdev, struct clock_event_device *newdev, int cpu) { @@ -236,6 +247,19 @@ static bool tick_check_preferred(struct clock_event_device *curdev, return !curdev || newdev->rating > curdev->rating; } +/* + * Check whether the new device is a better fit than curdev. curdev + * can be NULL ! + */ +bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if (tick_check_percpu(curdev, newdev, smp_processor_id())) + return false; + + return tick_check_preferred(curdev, newdev); +} + /* * Check, if the new registered device should be used. Called with * clockevents_lock held and interrupts disabled. 
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 06bfc8802df..be1690eaecf 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -11,6 +11,8 @@ extern seqlock_t jiffies_lock; #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 +#define CS_NAME_LEN 32 + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; extern ktime_t tick_period; @@ -23,9 +25,14 @@ extern void tick_handover_do_timer(int *cpup); extern void tick_shutdown(unsigned int *cpup); extern void tick_suspend(void); extern void tick_resume(void); +extern bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev); +extern void tick_install_replacement(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); +extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); + /* * NO_HZ / high resolution timer shared code */ -- cgit v1.2.3-70-g09d2 From bdc7119f1bdd0632d42f435941dc290216a436e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:55:38 +0900 Subject: cgroup: make cgroup_is_removed() static cgroup_is_removed() no longer has external users and it shouldn't grow any - controllers should deal with cgroup_subsys_state on/offline state instead of cgroup removal state. Make it static. While at it, make it return bool. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1df5f699be6..8d9f3c911fc 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -538,7 +538,6 @@ static inline const char *cgroup_name(const struct cgroup *cgrp) int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); -int cgroup_is_removed(const struct cgroup *cgrp); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a19419f4af1..501974823b3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -226,7 +226,7 @@ static int css_refcnt(struct cgroup_subsys_state *css) } /* convenient tests for these bits */ -inline int cgroup_is_removed(const struct cgroup *cgrp) +static inline bool cgroup_is_removed(const struct cgroup *cgrp) { return test_bit(CGRP_REMOVED, &cgrp->flags); } -- cgit v1.2.3-70-g09d2 From 53fa5261747a90746531e8a1c81eeb78fedc2f71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:55:38 +0900 Subject: cgroup: add cgroup->serial_nr and implement cgroup_next_sibling() Currently, there's no easy way to find out the next sibling cgroup unless it's known that the current cgroup is accessed from the parent's children list in a single RCU critical section. This in turn forces all iterators to require whole iteration to be enclosed in a single RCU critical section, which sometimes is too restrictive. This patch implements cgroup_next_sibling() which can reliably determine the next sibling regardless of the state of the current cgroup as long as it's accessible. It currently is impossible to determine the next sibling after dropping RCU read lock because the cgroup being iterated could be removed anytime and if RCU read lock is dropped, nothing guarantess its ->sibling.next pointer is accessible. 
A removed cgroup would continue to point to its next sibling for RCU accesses but stop receiving updates from the sibling. IOW, the next sibling could be removed and then complete its grace period while RCU read lock is dropped, making it unsafe to dereference ->sibling.next after dropping and re-acquiring RCU read lock. This can be solved by adding a way to traverse to the next sibling without dereferencing ->sibling.next. This patch adds a monotonically increasing cgroup serial number, cgroup->serial_nr, which guarantees that all cgroup->children lists are kept in increasing serial_nr order. A new function, cgroup_next_sibling(), is implemented, which, if CGRP_REMOVED is not set on the current cgroup, follows ->sibling.next; otherwise, traverses the parent's ->children list until it sees a sibling with higher ->serial_nr. This allows the function to always return the next sibling regardless of the state of the current cgroup without adding overhead in the fast path. Further patches will update the iterators to use cgroup_next_sibling() so that they allow dropping RCU read lock and blocking while iteration is in progress which in turn will be used to simplify controllers. v2: Typo fix as per Serge. Signed-off-by: Tejun Heo Acked-by: Serge E. Hallyn --- include/linux/cgroup.h | 10 ++++++++ kernel/cgroup.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8d9f3c911fc..ee041a01a67 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -188,6 +188,14 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct dentry *dentry; /* cgroup fs entry, RCU protected */ + /* + * Monotonically increasing unique serial number which defines a + * uniform order among all cgroups. It's guaranteed that all + * ->children lists are in the ascending order of ->serial_nr. + * It's used to allow interrupting and resuming iterations. + */ + u64 serial_nr; + /* * This is a copy of dentry->d_name, and it's needed because * we can't use dentry->d_name in cgroup_path(). @@ -675,6 +683,8 @@ static inline struct cgroup* task_cgroup(struct task_struct *task, return task_subsys_state(task, subsys_id)->cgroup; } +struct cgroup *cgroup_next_sibling(struct cgroup *pos); + /** * cgroup_for_each_child - iterate through children of a cgroup * @pos: the cgroup * to use as the loop cursor diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 501974823b3..b87c7a5a549 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2975,6 +2975,55 @@ static void cgroup_enable_task_cg_lists(void) write_unlock(&css_set_lock); } +/** + * cgroup_next_sibling - find the next sibling of a given cgroup + * @pos: the current cgroup + * + * This function returns the next sibling of @pos and should be called + * under RCU read lock. The only requirement is that @pos is accessible. + * The next sibling is guaranteed to be returned regardless of @pos's + * state. + */ +struct cgroup *cgroup_next_sibling(struct cgroup *pos) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* + * @pos could already have been removed. Once a cgroup is removed, + * its ->sibling.next is no longer updated when its next sibling + * changes. As CGRP_REMOVED is set on removal which is fully + * serialized, if we see it unasserted, it's guaranteed that the + * next sibling hasn't finished its grace period even if it's + * already removed, and thus safe to dereference from this RCU + * critical section. 
If ->sibling.next is inaccessible, + * cgroup_is_removed() is guaranteed to be visible as %true here. + */ + if (likely(!cgroup_is_removed(pos))) { + next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + if (&next->sibling != &pos->parent->children) + return next; + return NULL; + } + + /* + * Can't dereference the next pointer. Each cgroup is given a + * monotonically increasing unique serial number and always + * appended to the sibling list, so the next one can be found by + * walking the parent's children until we see a cgroup with higher + * serial number than @pos's. + * + * While this path can be slow, it's taken only when either the + * current cgroup is removed or iteration and removal race. + */ + list_for_each_entry_rcu(next, &pos->parent->children, sibling) + if (next->serial_nr > pos->serial_nr) + return next; + return NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_sibling); + /** * cgroup_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) @@ -4137,6 +4186,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { + static atomic64_t serial_nr_cursor = ATOMIC64_INIT(0); struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4217,6 +4267,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; lockdep_assert_held(&dentry->d_inode->i_mutex); + /* + * Assign a monotonically increasing serial number. With the list + * appending below, it guarantees that sibling cgroups are always + * sorted in the ascending serial number order on the parent's + * ->children. + */ + cgrp->serial_nr = atomic64_inc_return(&serial_nr_cursor); + /* allocation complete, commit to creation */ list_add_tail(&cgrp->allcg_node, &root->allcg_list); list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); @@ -4304,6 +4362,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * removed. This makes future css_tryget() and child creation * attempts fail thus maintaining the removal conditions verified * above. + * + * Note that CGRP_REMVOED clearing is depended upon by + * cgroup_next_sibling() to resume iteration after dropping RCU + * read lock. See cgroup_next_sibling() for details. */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; -- cgit v1.2.3-70-g09d2 From 75501a6d59e989e5c286716e5b3b66ace4660e83 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:55:38 +0900 Subject: cgroup: update iterators to use cgroup_next_sibling() This patch converts cgroup_for_each_child(), cgroup_next_descendant_pre/post() and thus cgroup_for_each_descendant_pre/post() to use cgroup_next_sibling() instead of manually dereferencing ->sibling.next. The only reason the iterators couldn't allow dropping RCU read lock while iteration is in progress was because they couldn't determine the next sibling safely once RCU read lock is dropped. Using cgroup_next_sibling() removes that problem and enables all iterators to allow dropping RCU read lock in the middle. Comments are updated accordingly. This makes the iterators easier to use and will simplify controllers. Note that @cgroup argument is renamed to @cgrp in cgroup_for_each_child() because it conflicts with "struct cgroup" used in the new macro body. Signed-off-by: Tejun Heo Acked-by: Serge E. 
Hallyn Reviewed-by: Michal Hocko --- include/linux/cgroup.h | 18 ++++++++++++++---- kernel/cgroup.c | 25 +++++++++++++++++++------ 2 files changed, 33 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ee041a01a67..d0ad3794b94 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -688,9 +688,9 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos); /** * cgroup_for_each_child - iterate through children of a cgroup * @pos: the cgroup * to use as the loop cursor - * @cgroup: cgroup whose children to walk + * @cgrp: cgroup whose children to walk * - * Walk @cgroup's children. Must be called under rcu_read_lock(). A child + * Walk @cgrp's children. Must be called under rcu_read_lock(). A child * cgroup which hasn't finished ->css_online() or already has finished * ->css_offline() may show up during traversal and it's each subsystem's * responsibility to verify that each @pos is alive. @@ -698,9 +698,15 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos); * If a subsystem synchronizes against the parent in its ->css_online() and * before starting iterating, a cgroup which finished ->css_online() is * guaranteed to be visible in the future iterations. + * + * It is allowed to temporarily drop RCU read lock during iteration. The + * caller is responsible for ensuring that @pos remains accessible until + * the start of the next iteration by, for example, bumping the css refcnt. */ -#define cgroup_for_each_child(pos, cgroup) \ - list_for_each_entry_rcu(pos, &(cgroup)->children, sibling) +#define cgroup_for_each_child(pos, cgrp) \ + for ((pos) = list_first_or_null_rcu(&(cgrp)->children, \ + struct cgroup, sibling); \ + (pos); (pos) = cgroup_next_sibling((pos))) struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, struct cgroup *cgroup); @@ -759,6 +765,10 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); * Alternatively, a subsystem may choose to use a single global lock to * synchronize ->css_online() and ->css_offline() against tree-walking * operations. + * + * It is allowed to temporarily drop RCU read lock during iteration. The + * caller is responsible for ensuring that @pos remains accessible until + * the start of the next iteration by, for example, bumping the css refcnt. */ #define cgroup_for_each_descendant_pre(pos, cgroup) \ for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos); \ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b87c7a5a549..fefc41c1a14 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3031,6 +3031,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_sibling); * * To be used by cgroup_for_each_descendant_pre(). Find the next * descendant to visit for pre-order traversal of @cgroup's descendants. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. 
*/ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, struct cgroup *cgroup) @@ -3050,11 +3055,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != cgroup) { - next = list_entry_rcu(pos->sibling.next, struct cgroup, - sibling); - if (&next->sibling != &pos->parent->children) + next = cgroup_next_sibling(pos); + if (next) return next; - pos = pos->parent; } @@ -3069,6 +3072,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); * Return the rightmost descendant of @pos. If there's no descendant, * @pos is returned. This can be used during pre-order traversal to skip * subtree of @pos. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct rightmost descendant as long as @pos is + * accessible. */ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) { @@ -3108,6 +3116,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) * * To be used by cgroup_for_each_descendant_post(). Find the next * descendant to visit for post-order traversal of @cgroup's descendants. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. */ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, struct cgroup *cgroup) @@ -3123,8 +3136,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, } /* if there's an unvisited sibling, visit its leftmost descendant */ - next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); - if (&next->sibling != &pos->parent->children) + next = cgroup_next_sibling(pos); + if (next) return cgroup_leftmost_descendant(next); /* no sibling left, visit parent */ -- cgit v1.2.3-70-g09d2 From 4eedb77a9cd8f2e68b31c8b9a20524a50727c16f Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Fri, 17 May 2013 10:51:33 +0200 Subject: locking: Fix copy/paste errors of "ARCH_INLINE_*_UNLOCK_BH" The Kconfig symbols ARCH_INLINE_READ_UNLOCK_IRQ, ARCH_INLINE_SPIN_UNLOCK_IRQ, and ARCH_INLINE_WRITE_UNLOCK_IRQ were added in v2.6.33, but have never actually been used. Ingo Molnar spotted that this is caused by three identical copy/paste erros. Eg, the Kconfig entry for INLINE_READ_UNLOCK_IRQ has an (optional) dependency on: ARCH_INLINE_READ_UNLOCK_BH were it apparently should depend on: ARCH_INLINE_READ_UNLOCK_IRQ instead. Likewise for the Kconfig entries for INLINE_SPIN_UNLOCK_IRQ and INLINE_WRITE_UNLOCK_IRQ. Fix these three errors. This never really caused any real problems as these symbols are set (or unset) in a group - but it's worth fixing it nevertheless. 
Reported-by: Ingo Molnar Signed-off-by: Paul Bolle Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1368780693.1350.228.camel@x61.thuisdomein Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 44511d100ea..d2b32ac27a3 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -138,7 +138,7 @@ config INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ config INLINE_SPIN_UNLOCK_IRQRESTORE def_bool y @@ -175,7 +175,7 @@ config INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ config INLINE_READ_UNLOCK_IRQRESTORE def_bool y @@ -212,7 +212,7 @@ config INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ config INLINE_WRITE_UNLOCK_IRQRESTORE def_bool y -- cgit v1.2.3-70-g09d2 From ab573844e3058eef2788803d373019f8bebead57 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 1 May 2013 17:25:44 +0200 Subject: perf: Fix hw breakpoints overflow period sampling The hw breakpoint pmu 'add' function is missing the period_left update needed for SW events. The perf HW breakpoint events use the SW events framework to process the overflow, so it needs to be properly initialized in the PMU 'add' method. Signed-off-by: Jiri Olsa Reviewed-by: Peter Zijlstra Cc: H. Peter Anvin Cc: Oleg Nesterov Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Paul Mackerras Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Vince Weaver Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1367421944-19082-5-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 2 +- kernel/events/hw_breakpoint.c | 5 +++++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f463a46424e..fa38612d70b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -743,6 +743,7 @@ extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); +extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern int __perf_event_disable(void *info); @@ -782,6 +783,7 @@ static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } +static inline u64 perf_swevent_set_period(struct perf_event *event) { return 0; } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 9dc297faf7c..e0dcced282e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4961,7 +4961,7 @@ static DEFINE_PER_CPU(struct swevent_htable, 
swevent_htable); * sign as trigger. */ -static u64 perf_swevent_set_period(struct perf_event *event) +u64 perf_swevent_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; u64 period = hwc->last_period; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a64f8aeb5c1..966a241e861 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -612,6 +612,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags) if (!(flags & PERF_EF_START)) bp->hw.state = PERF_HES_STOPPED; + if (is_sampling_event(bp)) { + bp->hw.last_period = bp->hw.sample_period; + perf_swevent_set_period(bp); + } + return arch_install_hw_breakpoint(bp); } -- cgit v1.2.3-70-g09d2 From 9e6302056f8029f438e853432a856b9f13de26a6 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 3 Apr 2013 14:21:33 +0200 Subject: perf: Use hrtimers for event multiplexing The current scheme of using the timer tick was fine for per-thread events. However, it was causing bias issues in system-wide mode (including for uncore PMUs). Event groups would not get their fair share of runtime on the PMU. With tickless kernels, if a core is idle there is no timer tick, and thus no event rotation (multiplexing). However, there are events (especially uncore events) which do count even though cores are asleep. This patch changes the timer source for multiplexing. It introduces a per-PMU per-cpu hrtimer. The advantage is that even when a core goes idle, it will come back to service the hrtimer, thus multiplexing on system-wide events works much better. The per-PMU implementation (suggested by PeterZ) enables adjusting the multiplexing interval per PMU. The preferred interval is stashed into the struct pmu. If not set, it will be forced to the default interval value. In order to minimize the impact of the hrtimer, it is turned on and off on demand. When the PMU on a CPU is overcommited, the hrtimer is activated. It is stopped when the PMU is not overcommitted. In order for this to work properly, we had to change the order of initialization in start_kernel() such that hrtimer_init() is run before perf_event_init(). The default interval in milliseconds is set to a timer tick just like with the old code. We will provide a sysctl to tune this in another patch. 
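As a rough, self-contained illustration of the mechanism described above -- a pinned per-CPU hrtimer that is armed on demand and re-arms itself only while rotation is still needed -- consider the sketch below. It is not the patch: do_rotate() is a stub standing in for perf_rotate_context(), and the helper names are made up.

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/time.h>

/* Default interval: one timer tick, matching the old jiffies-based code. */
#define MUX_INTERVAL_NS	(NSEC_PER_MSEC * (1000 / HZ))

static struct hrtimer mux_timer;

static int do_rotate(void)
{
	/* Stand-in for perf_rotate_context(): returns non-zero while more
	 * than one event group is contending for the PMU. */
	return 0;
}

static enum hrtimer_restart mux_timer_fn(struct hrtimer *hr)
{
	/* Re-arm only while the PMU is still overcommitted. */
	if (do_rotate()) {
		hrtimer_forward_now(hr, ns_to_ktime(MUX_INTERVAL_NS));
		return HRTIMER_RESTART;
	}
	return HRTIMER_NORESTART;
}

static void mux_timer_init(void)
{
	hrtimer_init(&mux_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	mux_timer.function = mux_timer_fn;
}

static void mux_timer_start(void)
{
	/* Armed on demand, e.g. when an event group fails to schedule. */
	if (!hrtimer_active(&mux_timer))
		hrtimer_start(&mux_timer, ns_to_ktime(MUX_INTERVAL_NS),
			      HRTIMER_MODE_REL_PINNED);
}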
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1364991694-5876-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 +- init/main.c | 2 +- kernel/events/core.c | 114 +++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 109 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fa38612d70b..72138d75a60 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -501,8 +501,9 @@ struct perf_cpu_context { struct perf_event_context *task_ctx; int active_oncpu; int exclusive; + struct hrtimer hrtimer; + ktime_t hrtimer_interval; struct list_head rotation_list; - int jiffies_interval; struct pmu *unique_pmu; struct perf_cgroup *cgrp; }; diff --git a/init/main.c b/init/main.c index 9484f4ba88d..ec549581d73 100644 --- a/init/main.c +++ b/init/main.c @@ -542,7 +542,6 @@ asmlinkage void __init start_kernel(void) if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); idr_init_cache(); - perf_event_init(); rcu_init(); tick_nohz_init(); radix_tree_init(); @@ -555,6 +554,7 @@ asmlinkage void __init start_kernel(void) softirq_init(); timekeeping_init(); time_init(); + perf_event_init(); profile_init(); call_function_init(); WARN(!irqs_disabled(), "Interrupts were enabled early\n"); diff --git a/kernel/events/core.c b/kernel/events/core.c index e0dcced282e..97bfac7e6f4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -170,6 +170,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); +static int perf_rotate_context(struct perf_cpu_context *cpuctx); + int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -658,6 +660,98 @@ perf_cgroup_mark_enabled(struct perf_event *event, } #endif +/* + * set default to be dependent on timer tick just + * like original code + */ +#define PERF_CPU_HRTIMER (1000 / HZ) +/* + * function must be called with interrupts disbled + */ +static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) +{ + struct perf_cpu_context *cpuctx; + enum hrtimer_restart ret = HRTIMER_NORESTART; + int rotations = 0; + + WARN_ON(!irqs_disabled()); + + cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); + + rotations = perf_rotate_context(cpuctx); + + /* + * arm timer if needed + */ + if (rotations) { + hrtimer_forward_now(hr, cpuctx->hrtimer_interval); + ret = HRTIMER_RESTART; + } + + return ret; +} + +/* CPU is going down */ +void perf_cpu_hrtimer_cancel(int cpu) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + if (WARN_ON(cpu != smp_processor_id())) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + if (pmu->task_ctx_nr == perf_sw_context) + continue; + + hrtimer_cancel(&cpuctx->hrtimer); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +{ + struct hrtimer *hr = &cpuctx->hrtimer; + struct pmu *pmu = cpuctx->ctx.pmu; + + /* no multiplexing needed for SW PMU */ + if (pmu->task_ctx_nr == perf_sw_context) + return; + + cpuctx->hrtimer_interval = + ns_to_ktime(NSEC_PER_MSEC * 
PERF_CPU_HRTIMER); + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + hr->function = perf_cpu_hrtimer_handler; +} + +static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) +{ + struct hrtimer *hr = &cpuctx->hrtimer; + struct pmu *pmu = cpuctx->ctx.pmu; + + /* not for SW PMU */ + if (pmu->task_ctx_nr == perf_sw_context) + return; + + if (hrtimer_active(hr)) + return; + + if (!hrtimer_callback_running(hr)) + __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, + 0, HRTIMER_MODE_REL_PINNED, 0); +} + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -1506,6 +1600,7 @@ group_sched_in(struct perf_event *group_event, if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); + perf_cpu_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -1552,6 +1647,8 @@ group_error: pmu->cancel_txn(pmu); + perf_cpu_hrtimer_restart(cpuctx); + return -EAGAIN; } @@ -1807,8 +1904,10 @@ static int __perf_event_enable(void *info) * If this event can't go on and it's part of a * group, then the whole group has to come off. */ - if (leader != event) + if (leader != event) { group_sched_out(leader, cpuctx, ctx); + perf_cpu_hrtimer_restart(cpuctx); + } if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_EVENT_STATE_ERROR; @@ -2555,7 +2654,7 @@ static void rotate_ctx(struct perf_event_context *ctx) * because they're strictly cpu affine and rotate_start is called with IRQs * disabled, while rotate_context is called from IRQ context. */ -static void perf_rotate_context(struct perf_cpu_context *cpuctx) +static int perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = NULL; int rotate = 0, remove = 1; @@ -2594,6 +2693,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) done: if (remove) list_del_init(&cpuctx->rotation_list); + + return rotate; } #ifdef CONFIG_NO_HZ_FULL @@ -2625,10 +2726,6 @@ void perf_event_task_tick(void) ctx = cpuctx->task_ctx; if (ctx) perf_adjust_freq_unthr_context(ctx, throttled); - - if (cpuctx->jiffies_interval == 1 || - !(jiffies % cpuctx->jiffies_interval)) - perf_rotate_context(cpuctx); } } @@ -6001,7 +6098,9 @@ skip_type: lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; - cpuctx->jiffies_interval = 1; + + __perf_cpu_hrtimer_init(cpuctx, cpu); + INIT_LIST_HEAD(&cpuctx->rotation_list); cpuctx->unique_pmu = pmu; } @@ -7387,7 +7486,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_DOWN_PREPARE: perf_event_exit_cpu(cpu); break; - default: break; } -- cgit v1.2.3-70-g09d2 From 62b8563979273424d6ebe9201e34d1acc133ad4f Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 3 Apr 2013 14:21:34 +0200 Subject: perf: Add sysfs entry to adjust multiplexing interval per PMU This patch adds /sys/device/xxx/perf_event_mux_interval_ms to ajust the multiplexing interval per PMU. The unit is milliseconds. Value has to be >= 1. In the 4th version, we renamed the sysfs file to be more consistent with the other /proc/sys/kernel entries for perf_events. In the 5th version, we handle the reprogramming of the hrtimer using hrtimer_forward_now(). That way, we sync up to new timer value quickly (suggested by Jiri Olsa). 
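A short userspace sketch of using the new knob. The "cpu" PMU directory in the path is an assumption for illustration; each PMU exposes its own copy of the attribute under its /sys/devices/<pmu>/ directory.

#include <stdio.h>

int main(void)
{
	/* Assumed location of the core PMU's attribute. */
	const char *path = "/sys/devices/cpu/perf_event_mux_interval_ms";
	FILE *f = fopen(path, "w");

	if (!f)
		return 1;
	/* Value is in milliseconds and must be >= 1; 4 ms here. */
	fprintf(f, "4\n");
	return fclose(f) ? 1 : 0;
}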
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1364991694-5876-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/core.c | 63 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 60 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 72138d75a60..6fddac1b27c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -194,6 +194,7 @@ struct pmu { int * __percpu pmu_disable_count; struct perf_cpu_context * __percpu pmu_cpu_context; int task_ctx_nr; + int hrtimer_interval_ms; /* * Fully disable/enable this PMU, can be used to protect from the PMI diff --git a/kernel/events/core.c b/kernel/events/core.c index 97bfac7e6f4..53d1b300116 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -723,13 +723,21 @@ static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) { struct hrtimer *hr = &cpuctx->hrtimer; struct pmu *pmu = cpuctx->ctx.pmu; + int timer; /* no multiplexing needed for SW PMU */ if (pmu->task_ctx_nr == perf_sw_context) return; - cpuctx->hrtimer_interval = - ns_to_ktime(NSEC_PER_MSEC * PERF_CPU_HRTIMER); + /* + * check default is sane, if not set then force to + * default interval (1/tick) + */ + timer = pmu->hrtimer_interval_ms; + if (timer < 1) + timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; + + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); hr->function = perf_cpu_hrtimer_handler; @@ -6001,9 +6009,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); } +static ssize_t +perf_event_mux_interval_ms_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); +} + +static ssize_t +perf_event_mux_interval_ms_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pmu *pmu = dev_get_drvdata(dev); + int timer, cpu, ret; + + ret = kstrtoint(buf, 0, &timer); + if (ret) + return ret; + + if (timer < 1) + return -EINVAL; + + /* same value, noting to do */ + if (timer == pmu->hrtimer_interval_ms) + return count; + + pmu->hrtimer_interval_ms = timer; + + /* update all cpuctx for this PMU */ + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + + if (hrtimer_active(&cpuctx->hrtimer)) + hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); + } + + return count; +} + +#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) + static struct device_attribute pmu_dev_attrs[] = { - __ATTR_RO(type), - __ATTR_NULL, + __ATTR_RO(type), + __ATTR_RW(perf_event_mux_interval_ms), + __ATTR_NULL, }; static int pmu_bus_running; -- cgit v1.2.3-70-g09d2 From 2b923c8f5de6722393e614b096d5040b6d4eaf98 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 21 May 2013 12:53:37 +0200 Subject: perf/x86: Check branch sampling priv level in generic code This patch moves commit 7cc23cd to the generic code: perf/x86/intel/lbr: Demand proper privileges for PERF_SAMPLE_BRANCH_KERNEL The check is now implemented in generic code instead of x86 
specific code. That way we do not have to repeat the test in each arch supporting branch sampling. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20130521105337.GA2879@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 13 +++---------- kernel/events/core.c | 9 ++++----- 2 files changed, 7 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index d978353c939..de341d4ec92 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -310,7 +310,7 @@ void intel_pmu_lbr_read(void) * - in case there is no HW filter * - in case the HW filter has errata or limitations */ -static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) +static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) { u64 br_type = event->attr.branch_sample_type; int mask = 0; @@ -318,11 +318,8 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_USER) mask |= X86_BR_USER; - if (br_type & PERF_SAMPLE_BRANCH_KERNEL) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) mask |= X86_BR_KERNEL; - } /* we ignore BRANCH_HV here */ @@ -342,8 +339,6 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) * be used by fixup code for some CPU */ event->hw.branch_reg.reg = mask; - - return 0; } /* @@ -391,9 +386,7 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) /* * setup SW LBR filter */ - ret = intel_pmu_setup_sw_lbr_filter(event); - if (ret) - return ret; + intel_pmu_setup_sw_lbr_filter(event); /* * setup HW LBR filter, if any diff --git a/kernel/events/core.c b/kernel/events/core.c index 53d1b300116..a0780b3a3d5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6481,11 +6481,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) return -EINVAL; - /* kernel level capture: check permissions */ - if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) - && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - /* propagate priv level, when not set for branch */ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { @@ -6503,6 +6498,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, */ attr->branch_sample_type = mask; } + /* kernel level capture: check permissions */ + if ((mask & PERF_SAMPLE_BRANCH_KERNEL) + && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { -- cgit v1.2.3-70-g09d2 From 0c1061733aa0303e6536c0bc7f86d68f5eb55446 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 15 May 2013 11:04:10 +0200 Subject: rtmutex: Document rt_mutex_adjust_prio_chain() Parameters and usage of rt_mutex_adjust_prio_chain() are already documented in Documentation/rt-mutex-design.txt. However, since this function is called from several paths with different semantics (related to the arguments), it is handy to have a quick reference directly in the code. 
Signed-off-by: Juri Lelli Cc: Clark Williams Cc: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1368608650-7935-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/rtmutex.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 1e09308bf2a..0dd6aec1cb6 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -145,6 +145,19 @@ int max_lock_depth = 1024; /* * Adjust the priority chain. Also used for deadlock detection. * Decreases task's usage by one - may thus free the task. + * + * @task: the task owning the mutex (owner) for which a chain walk is probably + * needed + * @deadlock_detect: do we have to carry out deadlock detection? + * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck + * things for a task that has just got its priority adjusted, and + * is waiting on a mutex) + * @orig_waiter: rt_mutex_waiter struct for the task that has just donated + * its priority to the mutex owner (can be NULL in the case + * depicted above or if the top waiter is gone away and we are + * actually deboosting the owner) + * @top_task: the current top waiter + * * Returns 0 or -EDEADLK. */ static int rt_mutex_adjust_prio_chain(struct task_struct *task, -- cgit v1.2.3-70-g09d2 From c7e99fc75de8882bc4104455ace366d9d3599a96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2013 09:28:02 +0200 Subject: clockevents: Define CS_NAME_LEN unconditionally Unbreak architectures which do not use clockevents, but require to build some of the core timekeeping infrastructure Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/tick-internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index be1690eaecf..bc906cad709 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -6,13 +6,13 @@ extern seqlock_t jiffies_lock; +#define CS_NAME_LEN 32 + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 -#define CS_NAME_LEN 32 - DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; extern ktime_t tick_period; -- cgit v1.2.3-70-g09d2 From 41261b6a832ea0e788627f6a8707854423f9ff49 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Fri, 24 May 2013 18:07:49 +0200 Subject: sched/autogroup: Fix race with task_groups list In autogroup_create(), a tg is allocated and added to the task_groups list. If CONFIG_RT_GROUP_SCHED is set, this tg is then modified while on the list, without locking. This can race with someone walking the list, like __enable_runtime() during CPU unplug, and result in a use-after-free bug. To fix this, move sched_online_group(), which adds the tg to the list, to the end of the autogroup_create() function after the modification. 
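As an aside, the fix follows a pattern worth spelling out: finish initializing an object completely before publishing it on a list that other CPUs may already be walking. A minimal sketch of that pattern, with hypothetical names (thing, thing_list, thing_lock) rather than actual scheduler code:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct thing {
        int prio;                       /* field a concurrent list walker may read */
        struct list_head node;
};

static LIST_HEAD(thing_list);
static DEFINE_SPINLOCK(thing_lock);

static struct thing *thing_create(int prio)
{
        struct thing *t = kzalloc(sizeof(*t), GFP_KERNEL);

        if (!t)
                return NULL;

        /* complete all initialization while the object is still private */
        t->prio = prio;

        /* only then make it visible to anyone walking the list */
        spin_lock(&thing_lock);
        list_add_tail(&t->node, &thing_list);
        spin_unlock(&thing_lock);

        return t;
}
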
Signed-off-by: Gerald Schaefer Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1369411669-46971-2-git-send-email-gerald.schaefer@de.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 64de5f8b0c9..4a073539c58 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void) if (IS_ERR(tg)) goto out_free; - sched_online_group(tg, &root_task_group); - kref_init(&ag->kref); init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); @@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void) #endif tg->autogroup = ag; + sched_online_group(tg, &root_task_group); return ag; out_free: -- cgit v1.2.3-70-g09d2 From c5405a495e88d93cf9b4f4cc91507c7f4afcb901 Mon Sep 17 00:00:00 2001 From: Neil Zhang Date: Thu, 11 Apr 2013 21:04:59 +0800 Subject: sched: Remove redundant update_runtime notifier migration_call() will do all the things that update_runtime() does. So let's remove it. Furthermore, there is potential risk that the current code will catch BUG_ON at line 689 of rt.c when do cpu hotplug while there are realtime threads running because of enabling runtime twice while the rt_runtime may already changed. Signed-off-by: Neil Zhang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365685499-26515-1-git-send-email-zhangwm@marvell.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 --- kernel/sched/rt.c | 40 ---------------------------------------- kernel/sched/sched.h | 1 - 3 files changed, 44 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bfa7e77e0b5..79e48e6a938 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6285,9 +6285,6 @@ void __init sched_init_smp(void) hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - /* RT runtime code needs to handle some hotplug events */ - hotcpu_notifier(update_runtime, 0); - init_hrtick(); /* Move init over to a non-isolated CPU */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7aced2e3b08..8853ab17b75 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -699,15 +699,6 @@ balanced: } } -static void disable_runtime(struct rq *rq) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __disable_runtime(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - static void __enable_runtime(struct rq *rq) { rt_rq_iter_t iter; @@ -732,37 +723,6 @@ static void __enable_runtime(struct rq *rq) } } -static void enable_runtime(struct rq *rq) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __enable_runtime(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - disable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - enable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - static int balance_runtime(struct rt_rq *rt_rq) { int more = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f1f6256c122..c806c61a126 100644 --- a/kernel/sched/sched.h +++ 
b/kernel/sched/sched.h @@ -1041,7 +1041,6 @@ static inline void idle_balance(int cpu, struct rq *rq) extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); -extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -- cgit v1.2.3-70-g09d2 From 77bd39702f0b3840cea17681409270b16a3b93c0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:50:58 +0200 Subject: sched: Update rq clock before migrating tasks out of dying CPU Because the sched_class::put_prev_task() callback of rt and fair classes are referring to the rq clock to update their runtime statistics. There is a missing rq clock update from the CPU hotplug notifier's entry point of the scheduler. Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 79e48e6a938..7bf0418dc60 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4378,6 +4378,13 @@ static void migrate_tasks(unsigned int dead_cpu) */ rq->stop = NULL; + /* + * put_prev_task() and pick_next_task() sched + * class method both need to have an up-to-date + * value of rq->clock[_task] + */ + update_rq_clock(rq); + for ( ; ; ) { /* * There's this thread running, bail when that's the only -- cgit v1.2.3-70-g09d2 From 71b1da46ff70309a2ec12ce943aa0d192d2c8f0c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:50:59 +0200 Subject: sched: Update rq clock before setting fair group shares Because we may update the execution time in sched_group_set_shares()->update_cfs_shares()->reweight_entity()->update_curr() before reweighting the entity while setting the group shares and this requires an uptodate version of the runqueue clock. Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f62b16dfba6..f76ca21711b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6107,6 +6107,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) se = tg->se[i]; /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); + + /* Possible calls to update_curr() need rq clock */ + update_rq_clock(rq); for_each_sched_entity(se) update_cfs_shares(group_cfs_rq(se)); raw_spin_unlock_irqrestore(&rq->lock, flags); -- cgit v1.2.3-70-g09d2 From 1ad4ec0dc740c4183acd6d6e367ca52b28e4fa94 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:51:00 +0200 Subject: sched: Update rq clock before calling check_preempt_curr() check_preempt_curr() of fair class needs an uptodate sched clock value to update runtime stats of the current task of the target's rq. When a task is woken up, activate_task() is usually called right before ttwu_do_wakeup() unless the task is still in the runqueue. 
In the latter case we need to update the rq clock explicitly because activate_task() isn't here to do the job for us. Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7bf0418dc60..46d00172ae4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1365,6 +1365,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p); if (p->on_rq) { + /* check_preempt_curr() may use rq clock */ + update_rq_clock(rq); ttwu_do_wakeup(rq, p, wake_flags); ret = 1; } -- cgit v1.2.3-70-g09d2 From 1a55af2e45cce0ff13bc33c8ee99da84e188b615 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:51:01 +0200 Subject: sched: Update rq clock earlier in unthrottle_cfs_rq In this function we are making use of rq->clock right before the update of the rq clock, let's just call update_rq_clock() just before that to avoid using a stale rq clock value. Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f76ca21711b..1c8762a5370 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2319,12 +2319,14 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; cfs_rq->throttled = 0; + + update_rq_clock(rq); + raw_spin_lock(&cfs_b->lock); cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); - update_rq_clock(rq); /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); -- cgit v1.2.3-70-g09d2 From 78becc27097585c6aec7043834cadde950ae79f2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:51:02 +0200 Subject: sched: Use an accessor to read the rq clock Read the runqueue clock through an accessor. This prepares for adding a debugging infrastructure to detect missing or redundant calls to update_rq_clock() between a scheduler's entry and exit point. 
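To make the motivation concrete: once every reader funnels through a single accessor, a future sanity check only has to live in one place. A rough sketch of where such a check could go (the clock_updated flag is hypothetical and not part of this series):

static inline u64 rq_clock(struct rq *rq)
{
#ifdef CONFIG_SCHED_DEBUG
        /* hypothetical debug state, only to show where a check would sit */
        WARN_ON_ONCE(!rq->clock_updated);
#endif
        return rq->clock;
}
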
Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/fair.c | 44 ++++++++++++++++++++++---------------------- kernel/sched/rt.c | 8 ++++---- kernel/sched/sched.h | 10 ++++++++++ kernel/sched/stats.h | 8 ++++---- kernel/sched/stop_task.c | 8 ++++---- 6 files changed, 47 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 46d00172ae4..36f85be2932 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -667,7 +667,7 @@ void sched_avg_update(struct rq *rq) { s64 period = sched_avg_period(); - while ((s64)(rq->clock - rq->age_stamp) > period) { + while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { /* * Inline assembly required to prevent the compiler * optimising this loop into a divmod call. @@ -1328,7 +1328,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) p->sched_class->task_woken(rq, p); if (rq->idle_stamp) { - u64 delta = rq->clock - rq->idle_stamp; + u64 delta = rq_clock(rq) - rq->idle_stamp; u64 max = 2*sysctl_sched_migration_cost; if (delta > max) @@ -2106,7 +2106,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) if (task_current(rq, p)) { update_rq_clock(rq); - ns = rq->clock_task - p->se.exec_start; + ns = rq_clock_task(rq) - p->se.exec_start; if ((s64)ns < 0) ns = 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1c8762a5370..3ee1c2e4ae6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -704,7 +704,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock_task; + u64 now = rq_clock_task(rq_of(cfs_rq)); unsigned long delta_exec; if (unlikely(!curr)) @@ -736,7 +736,7 @@ static void update_curr(struct cfs_rq *cfs_rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); + schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); } /* @@ -756,14 +756,14 @@ static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_of(cfs_rq)->clock - se->statistics.wait_start)); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_of(cfs_rq)->clock - se->statistics.wait_start); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { trace_sched_stat_wait(task_of(se), - rq_of(cfs_rq)->clock - se->statistics.wait_start); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); } #endif schedstat_set(se->statistics.wait_start, 0); @@ -789,7 +789,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ - se->exec_start = rq_of(cfs_rq)->clock_task; + se->exec_start = rq_clock_task(rq_of(cfs_rq)); } /************************************************** @@ -1515,7 +1515,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - 
__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } @@ -1530,7 +1530,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, * accumulated while sleeping. */ if (unlikely(se->avg.decay_count <= 0)) { - se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); if (se->avg.decay_count) { /* * In a wake-up migration we have to approximate the @@ -1625,7 +1625,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) tsk = task_of(se); if (se->statistics.sleep_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; if ((s64)delta < 0) delta = 0; @@ -1642,7 +1642,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } } if (se->statistics.block_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; if ((s64)delta < 0) delta = 0; @@ -1823,9 +1823,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_of(cfs_rq)->clock; + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_of(cfs_rq)->clock; + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); } #endif } @@ -2100,7 +2100,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) if (unlikely(cfs_rq->throttle_count)) return cfs_rq->throttled_clock_task; - return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; + return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; } /* returns 0 on failure to allocate runtime */ @@ -2159,7 +2159,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) + if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) return; if (cfs_rq->runtime_remaining < 0) @@ -2248,7 +2248,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) #ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { /* adjust cfs_rq_clock_task() */ - cfs_rq->throttled_clock_task_time += rq->clock_task - + cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; } #endif @@ -2263,7 +2263,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) - cfs_rq->throttled_clock_task = rq->clock_task; + cfs_rq->throttled_clock_task = rq_clock_task(rq); cfs_rq->throttle_count++; return 0; @@ -2302,7 +2302,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rq->nr_running -= task_delta; cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq->clock; + cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); raw_spin_unlock(&cfs_b->lock); @@ -2323,7 +2323,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq); raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; 
list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); @@ -2726,7 +2726,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) #else /* CONFIG_CFS_BANDWIDTH */ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { - return rq_of(cfs_rq)->clock_task; + return rq_clock_task(rq_of(cfs_rq)); } static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, @@ -3966,7 +3966,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -4322,7 +4322,7 @@ static unsigned long scale_rt_power(int cpu) age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); - total = sched_avg_period() + (rq->clock - age_stamp); + total = sched_avg_period() + (rq_clock(rq) - age_stamp); if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ @@ -5261,7 +5261,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - this_rq->idle_stamp = this_rq->clock; + this_rq->idle_stamp = rq_clock(this_rq); if (this_rq->avg_idle < sysctl_sched_migration_cost) return; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8853ab17b75..8d85f9ac426 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -886,7 +886,7 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; - delta_exec = rq->clock_task - curr->se.exec_start; + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; @@ -896,7 +896,7 @@ static void update_curr_rt(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq->clock_task; + curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -1345,7 +1345,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } while (rt_rq); p = rt_task_of(rt_se); - p->se.exec_start = rq->clock_task; + p->se.exec_start = rq_clock_task(rq); return p; } @@ -1997,7 +1997,7 @@ static void set_curr_task_rt(struct rq *rq) { struct task_struct *p = rq->curr; - p->se.exec_start = rq->clock_task; + p->se.exec_start = rq_clock_task(rq); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c806c61a126..74ff659e964 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -548,6 +548,16 @@ DECLARE_PER_CPU(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() (&__raw_get_cpu_var(runqueues)) +static inline u64 rq_clock(struct rq *rq) +{ + return rq->clock; +} + +static inline u64 rq_clock_task(struct rq *rq) +{ + return rq->clock_task; +} + #ifdef CONFIG_SMP #define rcu_dereference_check_sched_domain(p) \ diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 2ef90a51ec5..17d7065c387 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -61,7 +61,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) */ static inline void sched_info_dequeued(struct task_struct *t) { - unsigned long long now = task_rq(t)->clock, delta = 0; + unsigned long long now = rq_clock(task_rq(t)), delta = 0; if (unlikely(sched_info_on())) if (t->sched_info.last_queued) @@ -79,7 +79,7 @@ 
static inline void sched_info_dequeued(struct task_struct *t) */ static void sched_info_arrive(struct task_struct *t) { - unsigned long long now = task_rq(t)->clock, delta = 0; + unsigned long long now = rq_clock(task_rq(t)), delta = 0; if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; @@ -100,7 +100,7 @@ static inline void sched_info_queued(struct task_struct *t) { if (unlikely(sched_info_on())) if (!t->sched_info.last_queued) - t->sched_info.last_queued = task_rq(t)->clock; + t->sched_info.last_queued = rq_clock(task_rq(t)); } /* @@ -112,7 +112,7 @@ static inline void sched_info_queued(struct task_struct *t) */ static inline void sched_info_depart(struct task_struct *t) { - unsigned long long delta = task_rq(t)->clock - + unsigned long long delta = rq_clock(task_rq(t)) - t->sched_info.last_arrival; rq_sched_info_depart(task_rq(t), delta); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index da5eb5bed84..e08fbeeb54b 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -28,7 +28,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) struct task_struct *stop = rq->stop; if (stop && stop->on_rq) { - stop->se.exec_start = rq->clock_task; + stop->se.exec_start = rq_clock_task(rq); return stop; } @@ -57,7 +57,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) struct task_struct *curr = rq->curr; u64 delta_exec; - delta_exec = rq->clock_task - curr->se.exec_start; + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; @@ -67,7 +67,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq->clock_task; + curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); } @@ -79,7 +79,7 @@ static void set_curr_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - stop->se.exec_start = rq->clock_task; + stop->se.exec_start = rq_clock_task(rq); } static void switched_to_stop(struct rq *rq, struct task_struct *p) -- cgit v1.2.3-70-g09d2 From 1eaff67266b6b6c97bbd33cf2c20577822836413 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2013 09:48:46 +0200 Subject: clocksource: Implement clocksource_select_fallback() for CONFIG_ARCH_USES_GETTIMEOFFSET=y commit 7eaeb34305 (clocksource: Provide unbind interface in sysfs) implemented clocksource_select_fallback() which is not defined for CONFIG_ARCH_USES_GETTIMEOFFSET=y. Add an empty inline function for that. Reported-by: Ingo Molnar Reported-by: fengguang.wu@intel.com Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6d05b00410c..e713ef7d19a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -632,6 +632,7 @@ static void clocksource_select_fallback(void) #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } +static inline void clocksource_select_fallback(void) { } #endif -- cgit v1.2.3-70-g09d2 From a6572f84c5b135d9b6df279ed3c8de028bd1edd9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 17 May 2013 10:31:04 +0800 Subject: watchdog: Disallow setting watchdog_thresh to -1 In old kernels, it's allowed to set softlockup_thresh to -1 or 0 to disable softlockup detection. 
However watchdog_thresh only uses 0 to disable detection, and setting it to -1 just froze my box and nothing I can do but reboot. Signed-off-by: Li Zefan Acked-by: Don Zickus Link: http://lkml.kernel.org/r/51959668.9040106@huawei.com Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9edcf456e0f..b0a1f99907f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -120,7 +120,6 @@ extern int blk_iopoll_enabled; /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; -static int neg_one = -1; #endif static int zero; @@ -814,7 +813,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dowatchdog, - .extra1 = &neg_one, + .extra1 = &zero, .extra2 = &sixty, }, { -- cgit v1.2.3-70-g09d2 From 84f9f3a15611536537d59060818a2354d5039ff3 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 2 May 2013 15:34:33 +0200 Subject: sched: Use swap() macro in scale_stime() Simple cleanup. Reported-by: Peter Zijlstra Signed-off-by: Stanislaw Gruszka Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1367501673-6563-1-git-send-email-sgruszka@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cc2dc3eea8a..94691bcd736 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -515,9 +515,8 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) for (;;) { /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) { - u64 tmp = rtime; rtime = stime; stime = tmp; - } + if (stime > rtime) + swap(rtime, stime); /* Make sure 'total' fits in 32 bits */ if (total >> 32) -- cgit v1.2.3-70-g09d2 From cfeaa93f8a13ae9117ae20933a38a406de80849e Mon Sep 17 00:00:00 2001 From: Gerlando Falauto Date: Mon, 6 May 2013 14:30:17 +0000 Subject: genirq: Generic chip: Remove the local cur_regs() function Since we already have an irq_data_get_chip_type() function which returns a pointer to irq_chip_type, use that instead of cur_regs(). 
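For reference, the helper the patch switches to already exists in include/linux/irq.h (it also appears as context in a later hunk below); it recovers the containing irq_chip_type from the embedded irq_chip:

static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d)
{
        return container_of(d->chip, struct irq_chip_type, chip);
}
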
Signed-off-by: Gerlando Falauto Cc: Andrew Lunn Cc: Joey Oravec Cc: Lennert Buytenhek Cc: Russell King - ARM Linux Cc: Jason Gunthorpe Cc: Holger Brunck Cc: Ezequiel Garcia Acked-by: Grant Likely Cc: Sebastian Hesselbarth Cc: Jason Cooper Cc: Arnd Bergmann Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Ben Dooks Cc: Gregory Clement Cc: Simon Guinot Cc: linux-arm-kernel@lists.infradead.org Cc: Thomas Petazzoni Cc: Jean-Francois Moine Cc: Nicolas Pitre Cc: Rob Landley Cc: Maxime Ripard Link: http://lkml.kernel.org/r/20130506142539.010164766@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c89295a8f66..0e6ba789056 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -16,11 +16,6 @@ static LIST_HEAD(gc_list); static DEFINE_RAW_SPINLOCK(gc_lock); -static inline struct irq_chip_regs *cur_regs(struct irq_data *d) -{ - return &container_of(d->chip, struct irq_chip_type, chip)->regs; -} - /** * irq_gc_noop - NOOP function * @d: irq_data @@ -39,10 +34,11 @@ void irq_gc_noop(struct irq_data *d) void irq_gc_mask_disable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); + irq_reg_writel(mask, gc->reg_base + ct->regs.disable); gc->mask_cache &= ~mask; irq_gc_unlock(gc); } @@ -57,11 +53,12 @@ void irq_gc_mask_disable_reg(struct irq_data *d) void irq_gc_mask_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); gc->mask_cache |= mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + irq_reg_writel(gc->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } @@ -75,11 +72,12 @@ void irq_gc_mask_set_bit(struct irq_data *d) void irq_gc_mask_clr_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); gc->mask_cache &= ~mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + irq_reg_writel(gc->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } @@ -93,10 +91,11 @@ void irq_gc_mask_clr_bit(struct irq_data *d) void irq_gc_unmask_enable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); + irq_reg_writel(mask, gc->reg_base + ct->regs.enable); gc->mask_cache |= mask; irq_gc_unlock(gc); } @@ -108,10 +107,11 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) void irq_gc_ack_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_reg_writel(mask, gc->reg_base + ct->regs.ack); irq_gc_unlock(gc); } @@ -122,10 +122,11 @@ void irq_gc_ack_set_bit(struct irq_data *d) void irq_gc_ack_clr_bit(struct irq_data *d) { 
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = ~(1 << (d->irq - gc->irq_base)); irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_reg_writel(mask, gc->reg_base + ct->regs.ack); irq_gc_unlock(gc); } @@ -136,11 +137,12 @@ void irq_gc_ack_clr_bit(struct irq_data *d) void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_reg_writel(mask, gc->reg_base + ct->regs.mask); + irq_reg_writel(mask, gc->reg_base + ct->regs.ack); irq_gc_unlock(gc); } @@ -151,10 +153,11 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) void irq_gc_eoi(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); + irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); irq_gc_unlock(gc); } -- cgit v1.2.3-70-g09d2 From 899f0e66fff36ebb6dd6a83af9aa631f6cb7e0dc Mon Sep 17 00:00:00 2001 From: Gerlando Falauto Date: Mon, 6 May 2013 14:30:19 +0000 Subject: genirq: Generic chip: Add support for per chip type mask cache Today the same interrupt mask cache (stored within struct irq_chip_generic) is shared between all the irq_chip_type instances. As there are instances where each irq_chip_type uses a distinct mask register (as it is the case for Orion SoCs), sharing a single mask cache may be incorrect. So add a distinct pointer for each irq_chip_type, which for now points to the original mask register within irq_chip_generic. So no functional changes here. 
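To see why a single shared cache can be unsuitable, consider a hypothetical controller (register offsets invented) whose level and edge chip types mask different registers: one mask_cache value in irq_chip_generic cannot mirror both, so each irq_chip_type now carries its own mask_cache pointer (still aimed at the shared cache in this patch; the following patch lets it point at mask_cache_priv instead):

static void demo_init_chip_types(struct irq_chip_generic *gc)
{
        struct irq_chip_type *ct = gc->chip_types;      /* assumes num_ct == 2 */

        ct[0].regs.mask = 0x10;                 /* level bank mask register */
        ct[0].chip.irq_mask = irq_gc_mask_set_bit;
        ct[0].chip.irq_unmask = irq_gc_mask_clr_bit;

        ct[1].regs.mask = 0x14;                 /* edge bank mask register */
        ct[1].chip.irq_mask = irq_gc_mask_set_bit;
        ct[1].chip.irq_unmask = irq_gc_mask_clr_bit;
}
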
[ tglx: Minor cosmetic tweaks ] Reported-by: Joey Oravec Signed-off-by: Simon Guinot Signed-off-by: Holger Brunck Signed-off-by: Gerlando Falauto Cc: Andrew Lunn Cc: Lennert Buytenhek Cc: Russell King - ARM Linux Cc: Jason Gunthorpe Cc: Holger Brunck Cc: Ezequiel Garcia Acked-by: Grant Likely Cc: Sebastian Hesselbarth Cc: Jason Cooper Cc: Arnd Bergmann Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Ben Dooks Cc: Gregory Clement Cc: Simon Guinot Cc: linux-arm-kernel@lists.infradead.org Cc: Thomas Petazzoni Cc: Jean-Francois Moine Cc: Nicolas Pitre Cc: Rob Landley Cc: Maxime Ripard Link: http://lkml.kernel.org/r/20130506142539.082226607@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 6 +++++- kernel/irq/generic-chip.c | 16 ++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index bc4e0661195..38709a3ab1c 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -644,6 +644,8 @@ struct irq_chip_regs { * @regs: Register offsets for this chip * @handler: Flow handler associated with this chip * @type: Chip can handle these flow types + * @mask_cache_priv: Cached mask register private to the chip type + * @mask_cache: Pointer to cached mask register * * A irq_generic_chip can have several instances of irq_chip_type when * it requires different functions and register offsets for different @@ -654,6 +656,8 @@ struct irq_chip_type { struct irq_chip_regs regs; irq_flow_handler_t handler; u32 type; + u32 mask_cache_priv; + u32 *mask_cache; }; /** @@ -662,7 +666,7 @@ struct irq_chip_type { * @reg_base: Register base address (virtual) * @irq_base: Interrupt base nr for this chip * @irq_cnt: Number of interrupts handled by this chip - * @mask_cache: Cached mask register + * @mask_cache: Cached mask register shared between all chip types * @type_cache: Cached type register * @polarity_cache: Cached polarity register * @wake_enabled: Interrupt can wakeup from suspend diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 0e6ba789056..113d9ebfe0a 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d) irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.disable); - gc->mask_cache &= ~mask; + *ct->mask_cache &= ~mask; irq_gc_unlock(gc); } @@ -57,8 +57,8 @@ void irq_gc_mask_set_bit(struct irq_data *d) u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - gc->mask_cache |= mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + ct->regs.mask); + *ct->mask_cache |= mask; + irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } @@ -76,8 +76,8 @@ void irq_gc_mask_clr_bit(struct irq_data *d) u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - gc->mask_cache &= ~mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + ct->regs.mask); + *ct->mask_cache &= ~mask; + irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } @@ -96,7 +96,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.enable); - gc->mask_cache |= mask; + *ct->mask_cache |= mask; irq_gc_unlock(gc); } @@ -250,6 +250,10 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, if (flags & IRQ_GC_INIT_MASK_CACHE) gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); + /* Initialize mask cache pointer */ + for (i = 0; i < gc->num_ct; i++) + ct[i].mask_cache = 
&gc->mask_cache; + for (i = gc->irq_base; msk; msk >>= 1, i++) { if (!(msk & 0x01)) continue; -- cgit v1.2.3-70-g09d2 From af80b0fed67261dcba2ce2406db1d553d07cbe75 Mon Sep 17 00:00:00 2001 From: Gerlando Falauto Date: Mon, 6 May 2013 14:30:21 +0000 Subject: genirq: Generic chip: Handle separate mask registers There are cases where all irq_chip_type instances have separate mask registers, making a shared mask register cache unsuitable for the purpose. Introduce a new flag IRQ_GC_MASK_CACHE_PER_TYPE. If set, point the per chip mask pointer to the per chip private mask cache instead. [ tglx: Simplified code, renamed flag and massaged changelog ] Signed-off-by: Gerlando Falauto Cc: Andrew Lunn Cc: Joey Oravec Cc: Lennert Buytenhek Cc: Russell King - ARM Linux Cc: Jason Gunthorpe Cc: Holger Brunck Cc: Ezequiel Garcia Acked-by: Grant Likely Cc: Sebastian Hesselbarth Cc: Jason Cooper Cc: Arnd Bergmann Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Ben Dooks Cc: Gregory Clement Cc: Simon Guinot Cc: linux-arm-kernel@lists.infradead.org Cc: Thomas Petazzoni Cc: Jean-Francois Moine Cc: Nicolas Pitre Cc: Rob Landley Cc: Maxime Ripard Link: http://lkml.kernel.org/r/20130506142539.152569748@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 2 ++ kernel/irq/generic-chip.c | 17 ++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 38709a3ab1c..7f1f0157fd0 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -704,10 +704,12 @@ struct irq_chip_generic { * @IRQ_GC_INIT_NESTED_LOCK: Set the lock class of the irqs to nested for * irq chips which need to call irq_set_wake() on * the parent irq. Usually GPIO implementations + * @IRQ_GC_MASK_CACHE_PER_TYPE: Mask cache is chip type private */ enum irq_gc_flags { IRQ_GC_INIT_MASK_CACHE = 1 << 0, IRQ_GC_INIT_NESTED_LOCK = 1 << 1, + IRQ_GC_MASK_CACHE_PER_TYPE = 1 << 2, }; /* Generic chip callback functions */ diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 113d9ebfe0a..da2a94191fc 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -241,18 +241,21 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, { struct irq_chip_type *ct = gc->chip_types; unsigned int i; + u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask; raw_spin_lock(&gc_lock); list_add_tail(&gc->list, &gc_list); raw_spin_unlock(&gc_lock); - /* Init mask cache ? */ - if (flags & IRQ_GC_INIT_MASK_CACHE) - gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); - - /* Initialize mask cache pointer */ - for (i = 0; i < gc->num_ct; i++) - ct[i].mask_cache = &gc->mask_cache; + for (i = 0; i < gc->num_ct; i++) { + if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) { + mskptr = &ct[i].mask_cache_priv; + mskreg = ct[i].regs.mask; + } + ct[i].mask_cache = mskptr; + if (flags & IRQ_GC_INIT_MASK_CACHE) + *mskptr = irq_reg_readl(gc->reg_base + mskreg); + } for (i = gc->irq_base; msk; msk >>= 1, i++) { if (!(msk & 0x01)) -- cgit v1.2.3-70-g09d2 From 966dc736b819999cd2d3a6408d47d33b579f7d56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 May 2013 14:30:22 +0000 Subject: genirq: Generic chip: Cache per irq bit mask Cache the per irq bit mask instead of recalculating it over and over. 
Signed-off-by: Thomas Gleixner Cc: Thomas Petazzoni Cc: Andrew Lunn Cc: Russell King - ARM Linux Cc: Jason Cooper Cc: Arnd Bergmann Cc: Jean-Francois Moine Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Jason Gunthorpe Cc: Gregory Clement Cc: Gerlando Falauto Cc: Rob Landley Acked-by: Grant Likely Cc: Maxime Ripard Cc: Ezequiel Garcia Cc: linux-arm-kernel@lists.infradead.org Cc: Sebastian Hesselbarth Link: http://lkml.kernel.org/r/20130506142539.227119865@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ++++ kernel/irq/generic-chip.c | 23 ++++++++++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 7f1f0157fd0..d5fc7f5a49b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -119,6 +119,7 @@ struct irq_domain; /** * struct irq_data - per irq and irq chip data passed down to chip functions + * @mask: precomputed bitmask for accessing the chip registers * @irq: interrupt number * @hwirq: hardware interrupt number, local to the interrupt domain * @node: node index useful for balancing @@ -138,6 +139,7 @@ struct irq_domain; * irq_data. */ struct irq_data { + u32 mask; unsigned int irq; unsigned long hwirq; unsigned int node; @@ -705,11 +707,13 @@ struct irq_chip_generic { * irq chips which need to call irq_set_wake() on * the parent irq. Usually GPIO implementations * @IRQ_GC_MASK_CACHE_PER_TYPE: Mask cache is chip type private + * @IRQ_GC_NO_MASK: Do not calculate irq_data->mask */ enum irq_gc_flags { IRQ_GC_INIT_MASK_CACHE = 1 << 0, IRQ_GC_INIT_NESTED_LOCK = 1 << 1, IRQ_GC_MASK_CACHE_PER_TYPE = 1 << 2, + IRQ_GC_NO_MASK = 1 << 3, }; /* Generic chip callback functions */ diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index da2a94191fc..957155cebba 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -35,7 +35,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.disable); @@ -54,7 +54,7 @@ void irq_gc_mask_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); *ct->mask_cache |= mask; @@ -73,7 +73,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); *ct->mask_cache &= ~mask; @@ -92,7 +92,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.enable); @@ -108,7 +108,7 @@ void irq_gc_ack_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.ack); @@ -123,7 +123,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d) { struct irq_chip_generic *gc = 
irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = ~(1 << (d->irq - gc->irq_base)); + u32 mask = ~d->mask; irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.ack); @@ -138,7 +138,7 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.mask); @@ -154,7 +154,7 @@ void irq_gc_eoi(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); @@ -172,7 +172,7 @@ void irq_gc_eoi(struct irq_data *d) int irq_gc_set_wake(struct irq_data *d, unsigned int on) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; if (!(mask & gc->wake_enabled)) return -EINVAL; @@ -264,6 +264,11 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, if (flags & IRQ_GC_INIT_NESTED_LOCK) irq_set_lockdep_class(i, &irq_nested_lock_class); + if (!(flags & IRQ_GC_NO_MASK)) { + struct irq_data *d = irq_get_irq_data(i); + + d->mask = 1 << (i - gc->irq_base); + } irq_set_chip_and_handler(i, &ct->chip, ct->handler); irq_set_chip_data(i, gc); irq_modify_status(i, clr, set); -- cgit v1.2.3-70-g09d2 From d0051816e619f8f082582bec07ffa51bdb4f2104 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 May 2013 14:30:24 +0000 Subject: genirq: irqchip: Add a mask calculation function Some chips have weird bit mask access patterns instead of the linear you expect. Allow them to calculate the cached mask themself. 
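As a concrete (and entirely hypothetical) example of such a callback, a controller whose enable bits run from the top of the register downwards could precompute the cached mask like this and hook it up via .irq_calc_mask in its irq_chip, so the setup code uses it instead of the default 1 << (irq - irq_base):

static void demo_calc_mask(struct irq_data *d)
{
        struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);

        /* bit 31 serves the chip's first irq, bit 30 the second, ... */
        d->mask = 1 << (31 - (d->irq - gc->irq_base));
}
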
Signed-off-by: Thomas Gleixner Cc: Thomas Petazzoni Cc: Andrew Lunn Cc: Russell King - ARM Linux Cc: Jason Cooper Cc: Arnd Bergmann Cc: Jean-Francois Moine Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Jason Gunthorpe Cc: Gregory Clement Cc: Gerlando Falauto Cc: Rob Landley Acked-by: Grant Likely Cc: Maxime Ripard Cc: Ezequiel Garcia Cc: linux-arm-kernel@lists.infradead.org Cc: Sebastian Hesselbarth Link: http://lkml.kernel.org/r/20130506142539.302898834@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 3 +++ kernel/irq/generic-chip.c | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index d5fc7f5a49b..ab8169faaa6 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -296,6 +296,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @irq_suspend: function called from core code on suspend once per chip * @irq_resume: function called from core code on resume once per chip * @irq_pm_shutdown: function called from core code on shutdown once per chip + * @irq_calc_mask: Optional function to set irq_data.mask for special cases * @irq_print_chip: optional to print special chip info in show_interrupts * @flags: chip specific flags */ @@ -327,6 +328,8 @@ struct irq_chip { void (*irq_resume)(struct irq_data *data); void (*irq_pm_shutdown)(struct irq_data *data); + void (*irq_calc_mask)(struct irq_data *data); + void (*irq_print_chip)(struct irq_data *data, struct seq_file *p); unsigned long flags; diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 957155cebba..5068fe3ae1a 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -240,6 +240,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int set) { struct irq_chip_type *ct = gc->chip_types; + struct irq_chip *chip = &ct->chip; unsigned int i; u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask; @@ -267,9 +268,12 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, if (!(flags & IRQ_GC_NO_MASK)) { struct irq_data *d = irq_get_irq_data(i); - d->mask = 1 << (i - gc->irq_base); + if (chip->irq_calc_mask) + chip->irq_calc_mask(d); + else + d->mask = 1 << (i - gc->irq_base); } - irq_set_chip_and_handler(i, &ct->chip, ct->handler); + irq_set_chip_and_handler(i, chip, ct->handler); irq_set_chip_data(i, gc); irq_modify_status(i, clr, set); } -- cgit v1.2.3-70-g09d2 From 3528d82b684680b72fa31881c8c572c5a98b51de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 May 2013 14:30:25 +0000 Subject: genirq: Generic chip: Split out code into separate functions Preparatory patch for linear interrupt domains. 
Signed-off-by: Thomas Gleixner Cc: Thomas Petazzoni Cc: Andrew Lunn Cc: Russell King - ARM Linux Cc: Jason Cooper Cc: Arnd Bergmann Cc: Jean-Francois Moine Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Jason Gunthorpe Cc: Gregory Clement Cc: Gerlando Falauto Cc: Rob Landley Acked-by: Grant Likely Cc: Maxime Ripard Cc: Ezequiel Garcia Cc: linux-arm-kernel@lists.infradead.org Cc: Sebastian Hesselbarth Link: http://lkml.kernel.org/r/20130506142539.377017672@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 50 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 5068fe3ae1a..3deb3333d53 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -186,6 +186,19 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) return 0; } +static void +irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, + int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler) +{ + raw_spin_lock_init(&gc->lock); + gc->num_ct = num_ct; + gc->irq_base = irq_base; + gc->reg_base = reg_base; + gc->chip_types->chip.name = name; + gc->chip_types->handler = handler; +} + /** * irq_alloc_generic_chip - Allocate a generic chip and initialize it * @name: Name of the irq chip @@ -206,17 +219,31 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, gc = kzalloc(sz, GFP_KERNEL); if (gc) { - raw_spin_lock_init(&gc->lock); - gc->num_ct = num_ct; - gc->irq_base = irq_base; - gc->reg_base = reg_base; - gc->chip_types->chip.name = name; - gc->chip_types->handler = handler; + irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base, + handler); } return gc; } EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); +static void +irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) +{ + struct irq_chip_type *ct = gc->chip_types; + u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask; + int i; + + for (i = 0; i < gc->num_ct; i++) { + if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) { + mskptr = &ct[i].mask_cache_priv; + mskreg = ct[i].regs.mask; + } + ct[i].mask_cache = mskptr; + if (flags & IRQ_GC_INIT_MASK_CACHE) + *mskptr = irq_reg_readl(gc->reg_base + mskreg); + } +} + /* * Separate lockdep class for interrupt chip which can nest irq_desc * lock. @@ -242,21 +269,12 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, struct irq_chip_type *ct = gc->chip_types; struct irq_chip *chip = &ct->chip; unsigned int i; - u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask; raw_spin_lock(&gc_lock); list_add_tail(&gc->list, &gc_list); raw_spin_unlock(&gc_lock); - for (i = 0; i < gc->num_ct; i++) { - if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) { - mskptr = &ct[i].mask_cache_priv; - mskreg = ct[i].regs.mask; - } - ct[i].mask_cache = mskptr; - if (flags & IRQ_GC_INIT_MASK_CACHE) - *mskptr = irq_reg_readl(gc->reg_base + mskreg); - } + irq_gc_init_mask_cache(gc, flags); for (i = gc->irq_base; msk; msk >>= 1, i++) { if (!(msk & 0x01)) -- cgit v1.2.3-70-g09d2 From 088f40b7b027dad6519712ff224a5798dd62a204 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 May 2013 14:30:27 +0000 Subject: genirq: Generic chip: Add linear irq domain support Provide infrastructure for irq chip implementations which work on linear irq domains. - Interface to allocate multiple generic chips which are associated to the irq domain. 
- Interface to get the generic chip pointer for a particular hardware interrupt in the domain. - irq domain mapping function to install the chip for a particular interrupt. Note: This lacks a removal function for now. [ Sebastian Hesselbarth: Mask cache and pointer math fixups ] Signed-off-by: Thomas Gleixner Cc: Thomas Petazzoni Cc: Andrew Lunn Cc: Russell King - ARM Linux Cc: Jason Cooper Cc: Arnd Bergmann Cc: Jean-Francois Moine Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Jason Gunthorpe Cc: Gregory Clement Cc: Gerlando Falauto Cc: Rob Landley Acked-by: Grant Likely Cc: Maxime Ripard Cc: Ezequiel Garcia Cc: linux-arm-kernel@lists.infradead.org Cc: Sebastian Hesselbarth Link: http://lkml.kernel.org/r/20130506142539.450634298@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 30 ++++++++ include/linux/irqdomain.h | 12 +++ kernel/irq/generic-chip.c | 187 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/irq/irqdomain.c | 6 -- 4 files changed, 223 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index ab8169faaa6..af7052c6a45 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -678,6 +678,8 @@ struct irq_chip_type { * @wake_active: Interrupt is marked as an wakeup from suspend source * @num_ct: Number of available irq_chip_type instances (usually 1) * @private: Private data for non generic chip callbacks + * @installed: bitfield to denote installed interrupts + * @domain: irq domain pointer * @list: List head for keeping track of instances * @chip_types: Array of interrupt irq_chip_types * @@ -699,6 +701,8 @@ struct irq_chip_generic { u32 wake_active; unsigned int num_ct; void *private; + unsigned long installed; + struct irq_domain *domain; struct list_head list; struct irq_chip_type chip_types[0]; }; @@ -719,6 +723,24 @@ enum irq_gc_flags { IRQ_GC_NO_MASK = 1 << 3, }; +/* + * struct irq_domain_chip_generic - Generic irq chip data structure for irq domains + * @irqs_per_chip: Number of interrupts per chip + * @num_chips: Number of chips + * @irq_flags_to_set: IRQ* flags to set on irq setup + * @irq_flags_to_clear: IRQ* flags to clear on irq setup + * @gc_flags: Generic chip specific setup flags + * @gc: Array of pointers to generic interrupt chips + */ +struct irq_domain_chip_generic { + unsigned int irqs_per_chip; + unsigned int num_chips; + unsigned int irq_flags_to_clear; + unsigned int irq_flags_to_set; + enum irq_gc_flags gc_flags; + struct irq_chip_generic *gc[0]; +}; + /* Generic chip callback functions */ void irq_gc_noop(struct irq_data *d); void irq_gc_mask_disable_reg(struct irq_data *d); @@ -742,6 +764,14 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type); void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set); +struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); +int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, + int num_ct, const char *name, + irq_flow_handler_t handler, + unsigned int clr, unsigned int set, + enum irq_gc_flags flags); + + static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) { return container_of(d->chip, struct irq_chip_type, chip); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 0d5b17bf5e5..ba2c708adcf 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -66,6 +66,10 @@ struct irq_domain_ops { unsigned long *out_hwirq, unsigned int *out_type); }; 
+extern struct irq_domain_ops irq_generic_chip_ops; + +struct irq_domain_chip_generic; + /** * struct irq_domain - Hardware interrupt number translation object * @link: Element in global irq_domain list. @@ -109,8 +113,16 @@ struct irq_domain { /* Optional device node pointer */ struct device_node *of_node; + /* Optional pointer to generic interrupt chips */ + struct irq_domain_chip_generic *gc; }; +#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. + * ie. legacy 8259, gets irqs 1..15 */ +#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ +#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ +#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ + #ifdef CONFIG_IRQ_DOMAIN struct irq_domain *irq_domain_add_simple(struct device_node *of_node, unsigned int size, diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 3deb3333d53..8743d62fded 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -244,12 +245,156 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) } } +/** + * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain + * @d: irq domain for which to allocate chips + * @irqs_per_chip: Number of interrupts each chip handles + * @num_ct: Number of irq_chip_type instances associated with this + * @name: Name of the irq chip + * @handler: Default flow handler associated with these chips + * @clr: IRQ_* bits to clear in the mapping function + * @set: IRQ_* bits to set in the mapping function + */ +int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, + int num_ct, const char *name, + irq_flow_handler_t handler, + unsigned int clr, unsigned int set, + enum irq_gc_flags gcflags) +{ + struct irq_domain_chip_generic *dgc; + struct irq_chip_generic *gc; + int numchips, sz, i; + unsigned long flags; + void *tmp; + + if (d->gc) + return -EBUSY; + + if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) + return -EINVAL; + + numchips = d->revmap_data.linear.size / irqs_per_chip; + if (!numchips) + return -EINVAL; + + /* Allocate a pointer, generic chip and chiptypes for each chip */ + sz = sizeof(*dgc) + numchips * sizeof(gc); + sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type)); + + tmp = dgc = kzalloc(sz, GFP_KERNEL); + if (!dgc) + return -ENOMEM; + dgc->irqs_per_chip = irqs_per_chip; + dgc->num_chips = numchips; + dgc->irq_flags_to_set = set; + dgc->irq_flags_to_clear = clr; + dgc->gc_flags = gcflags; + d->gc = dgc; + + /* Calc pointer to the first generic chip */ + tmp += sizeof(*dgc) + numchips * sizeof(gc); + for (i = 0; i < numchips; i++) { + /* Store the pointer to the generic chip */ + dgc->gc[i] = gc = tmp; + irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, + NULL, handler); + gc->domain = d; + raw_spin_lock_irqsave(&gc_lock, flags); + list_add_tail(&gc->list, &gc_list); + raw_spin_unlock_irqrestore(&gc_lock, flags); + /* Calc pointer to the next generic chip */ + tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); + } + return 0; +} +EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); + +/** + * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq + * @d: irq domain pointer + * @hw_irq: Hardware interrupt number + */ +struct irq_chip_generic * +irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) +{ + struct irq_domain_chip_generic *dgc = d->gc; + int idx; + + if (!dgc) + return NULL; + idx = hw_irq / 
dgc->irqs_per_chip; + if (idx >= dgc->num_chips) + return NULL; + return dgc->gc[idx]; +} +EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); + /* * Separate lockdep class for interrupt chip which can nest irq_desc * lock. */ static struct lock_class_key irq_nested_lock_class; +/** + * irq_map_generic_chip - Map a generic chip for an irq domain + */ +static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, + irq_hw_number_t hw_irq) +{ + struct irq_data *data = irq_get_irq_data(virq); + struct irq_domain_chip_generic *dgc = d->gc; + struct irq_chip_generic *gc; + struct irq_chip_type *ct; + struct irq_chip *chip; + unsigned long flags; + int idx; + + if (!d->gc) + return -ENODEV; + + idx = hw_irq / dgc->irqs_per_chip; + if (idx >= dgc->num_chips) + return -EINVAL; + gc = dgc->gc[idx]; + + idx = hw_irq % dgc->irqs_per_chip; + + if (test_bit(idx, &gc->installed)) + return -EBUSY; + + ct = gc->chip_types; + chip = &ct->chip; + + /* We only init the cache for the first mapping of a generic chip */ + if (!gc->installed) { + raw_spin_lock_irqsave(&gc->lock, flags); + irq_gc_init_mask_cache(gc, dgc->gc_flags); + raw_spin_unlock_irqrestore(&gc->lock, flags); + } + + /* Mark the interrupt as installed */ + set_bit(idx, &gc->installed); + + if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) + irq_set_lockdep_class(virq, &irq_nested_lock_class); + + if (chip->irq_calc_mask) + chip->irq_calc_mask(data); + else + data->mask = 1 << idx; + + irq_set_chip_and_handler(virq, chip, ct->handler); + irq_set_chip_data(virq, gc); + irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); + return 0; +} + +struct irq_domain_ops irq_generic_chip_ops = { + .map = irq_map_generic_chip, + .xlate = irq_domain_xlate_onetwocell, +}; +EXPORT_SYMBOL_GPL(irq_generic_chip_ops); + /** * irq_setup_generic_chip - Setup a range of interrupts with a generic chip * @gc: Generic irq chip holding all data @@ -354,6 +499,24 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, } EXPORT_SYMBOL_GPL(irq_remove_generic_chip); +static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc) +{ + unsigned int virq; + + if (!gc->domain) + return irq_get_irq_data(gc->irq_base); + + /* + * We don't know which of the irqs has been actually + * installed. Use the first one. + */ + if (!gc->installed) + return NULL; + + virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed)); + return virq ? 
irq_get_irq_data(virq) : NULL; +} + #ifdef CONFIG_PM static int irq_gc_suspend(void) { @@ -362,8 +525,12 @@ static int irq_gc_suspend(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; - if (ct->chip.irq_suspend) - ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); + if (ct->chip.irq_suspend) { + struct irq_data *data = irq_gc_get_irq_data(gc); + + if (data) + ct->chip.irq_suspend(data); + } } return 0; } @@ -375,8 +542,12 @@ static void irq_gc_resume(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; - if (ct->chip.irq_resume) - ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); + if (ct->chip.irq_resume) { + struct irq_data *data = irq_gc_get_irq_data(gc); + + if (data) + ct->chip.irq_resume(data); + } } } #else @@ -391,8 +562,12 @@ static void irq_gc_shutdown(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; - if (ct->chip.irq_pm_shutdown) - ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); + if (ct->chip.irq_pm_shutdown) { + struct irq_data *data = irq_gc_get_irq_data(gc); + + if (data) + ct->chip.irq_pm_shutdown(data); + } } } diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5a83dde8ca0..1db9e70f548 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -16,12 +16,6 @@ #include #include -#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. - * ie. legacy 8259, gets irqs 1..15 */ -#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ -#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ -#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ - static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); -- cgit v1.2.3-70-g09d2 From e8bd834f73714378ef110a64287db1b77033c8da Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 29 May 2013 03:10:52 +0100 Subject: genirq: irqchip: Add mask to block out invalid irqs Some controllers have irqs that aren't wired up and must never be used. For the generic chip attached to an irq_domain this provides a mask that can be used to block out particular irqs so that they never get mapped. 
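
For readers coming to the generic-chip irq_domain support cold, a minimal driver-init sketch may help. It is only an illustration under assumptions, not code from this series: the controller name "example-intc", the 32-line count, the register offsets and the two dead lines are all made up. It strings together irq_domain_add_linear() with irq_generic_chip_ops, irq_alloc_domain_generic_chips(), and the new gc->unused mask so that unwired lines can never be mapped:

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/bitops.h>

/* Hypothetical init for a memory-mapped controller with 32 lines */
static int example_intc_init(struct device_node *node, void __iomem *base)
{
	struct irq_domain *domain;
	struct irq_chip_generic *gc;
	struct irq_chip_type *ct;
	int ret;

	/* A linear revmap is required by irq_alloc_domain_generic_chips() */
	domain = irq_domain_add_linear(node, 32, &irq_generic_chip_ops, NULL);
	if (!domain)
		return -ENOMEM;

	/* One generic chip with one irq_chip_type covering all 32 lines */
	ret = irq_alloc_domain_generic_chips(domain, 32, 1, "example-intc",
					     handle_level_irq, IRQ_NOREQUEST, 0,
					     IRQ_GC_INIT_MASK_CACHE);
	if (ret)
		return ret;

	gc = irq_get_domain_generic_chip(domain, 0);
	gc->reg_base = base;

	ct = gc->chip_types;
	ct->regs.mask = 0x04;			/* hypothetical MASK register */
	ct->regs.ack  = 0x08;			/* hypothetical ACK register */
	ct->chip.irq_mask   = irq_gc_mask_set_bit;
	ct->chip.irq_unmask = irq_gc_mask_clr_bit;
	ct->chip.irq_ack    = irq_gc_ack_set_bit;

	/* Lines 30 and 31 are not wired up: block them from ever being mapped */
	gc->unused = BIT(30) | BIT(31);

	return 0;
}

With such a setup the map callback (irq_map_generic_chip() above) rejects hw_irq 30 and 31 with -ENOTSUPP, so no virq is ever handed out for a line that cannot fire.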
Signed-off-by: Grant Likely Link: http://lkml.kernel.org/r/1369793454-19197-2-git-send-email-grant.likely@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 2 ++ kernel/irq/generic-chip.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index af7052c6a45..298a9b9ce67 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -679,6 +679,7 @@ struct irq_chip_type { * @num_ct: Number of available irq_chip_type instances (usually 1) * @private: Private data for non generic chip callbacks * @installed: bitfield to denote installed interrupts + * @unused: bitfield to denote unused interrupts * @domain: irq domain pointer * @list: List head for keeping track of instances * @chip_types: Array of interrupt irq_chip_types @@ -702,6 +703,7 @@ struct irq_chip_generic { unsigned int num_ct; void *private; unsigned long installed; + unsigned long unused; struct irq_domain *domain; struct list_head list; struct irq_chip_type chip_types[0]; diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 8743d62fded..95575d8d539 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -359,6 +359,9 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, idx = hw_irq % dgc->irqs_per_chip; + if (test_bit(idx, &gc->unused)) + return -ENOTSUPP; + if (test_bit(idx, &gc->installed)) return -EBUSY; -- cgit v1.2.3-70-g09d2 From d671a605580d2caafc77f1a25bcf8435795df6fe Mon Sep 17 00:00:00 2001 From: Andreas Fenkart Date: Fri, 10 May 2013 12:21:30 +0200 Subject: genirq: Add kerneldoc for irq_disable. Document the lazy disable functionality. comment based on changelog of d209a699a0b975ad Signed-off-by: Andreas Fenkart Cc: balbi@ti.com Link: http://lkml.kernel.org/r/1368181290-1583-1-git-send-email-andreas.fenkart@streamunlimited.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cbd97ce0b00..a3bb14fbe5c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -213,6 +213,19 @@ void irq_enable(struct irq_desc *desc) irq_state_clr_masked(desc); } +/** + * irq_disable - Mark interupt disabled + * @desc: irq descriptor which should be disabled + * + * If the chip does not implement the irq_disable callback, we + * use a lazy disable approach. That means we mark the interrupt + * disabled, but leave the hardware unmasked. That's an + * optimization because we avoid the hardware access for the + * common case where no interrupt happens after we marked it + * disabled. If an interrupt happens, then the interrupt flow + * handler masks the line at the hardware level and marks it + * pending. + */ void irq_disable(struct irq_desc *desc) { irq_state_set_disabled(desc); -- cgit v1.2.3-70-g09d2 From 6cffe00f7d4e24679eae6b7aae4caaf915288256 Mon Sep 17 00:00:00 2001 From: Todd Poynor Date: Wed, 15 May 2013 14:38:11 -0700 Subject: alarmtimer: Add functions for timerfd support Add functions needed for hooking up alarmtimer to timerfd: * alarm_restart: Similar to hrtimer_restart, restart an alarmtimer after the expires time has already been updated (as with alarm_forward). * alarm_forward_now: Similar to hrtimer_forward_now, move the expires time forward to an interval from the current time of the associated clock. * alarm_start_relative: Start an alarmtimer with an expires time relative to the current time of the associated clock. 
* alarm_expires_remaining: Similar to hrtimer_expires_remaining, return the amount of time remaining until alarm expiry. Signed-off-by: Todd Poynor Signed-off-by: John Stultz --- include/linux/alarmtimer.h | 4 ++++ kernel/time/alarmtimer.c | 39 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 9069694e70e..a899402a5a0 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -44,10 +44,14 @@ struct alarm { void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)); int alarm_start(struct alarm *alarm, ktime_t start); +int alarm_start_relative(struct alarm *alarm, ktime_t start); +void alarm_restart(struct alarm *alarm); int alarm_try_to_cancel(struct alarm *alarm); int alarm_cancel(struct alarm *alarm); u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval); +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval); +ktime_t alarm_expires_remaining(const struct alarm *alarm); /* Provide way to access the rtc device being used by alarmtimers */ struct rtc_device *alarmtimer_get_rtcdev(void); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index f11d83b1294..3e5cba27447 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -199,6 +199,12 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } +ktime_t alarm_expires_remaining(const struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + return ktime_sub(alarm->node.expires, base->gettime()); +} + #ifdef CONFIG_RTC_CLASS /** * alarmtimer_suspend - Suspend time callback @@ -305,7 +311,7 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, } /** - * alarm_start - Sets an alarm to fire + * alarm_start - Sets an absolute alarm to fire * @alarm: ptr to alarm to set * @start: time to run the alarm */ @@ -324,6 +330,31 @@ int alarm_start(struct alarm *alarm, ktime_t start) return ret; } +/** + * alarm_start_relative - Sets a relative alarm to fire + * @alarm: ptr to alarm to set + * @start: time relative to now to run the alarm + */ +int alarm_start_relative(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + start = ktime_add(start, base->gettime()); + return alarm_start(alarm, start); +} + +void alarm_restart(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + hrtimer_restart(&alarm->timer); + alarmtimer_enqueue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); +} + /** * alarm_try_to_cancel - Tries to cancel an alarm timer * @alarm: ptr to alarm to be canceled @@ -394,6 +425,12 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) return overrun; } +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + return alarm_forward(alarm, base->gettime(), interval); +} -- cgit v1.2.3-70-g09d2 From 5c83545f24ab3dd67e0ae0e2b795fea750f08c34 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Tue, 21 May 2013 22:32:14 -0700 Subject: power: Add option to log time spent in suspend Below is a patch from android kernel that maintains a histogram of suspend times. Please review and provide feedback. 
Statistices on the time spent in suspend are kept in /sys/kernel/debug/sleep_time. Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Colin Cross Signed-off-by: Todd Poynor [zoran.markovic@linaro.org: Re-formatted suspend time table to better fit expected values. Moved accounting of suspend time into timekeeping core. Removed CONFIG_SUSPEND_TIME flag and made the feature conditional on CONFIG_DEBUG_FS. Changed the file name to sleep_time to better fit terminology in timekeeping core. Changed seq_printf to seq_puts. Tweaked commit message] Signed-off-by: Zoran Markovic Signed-off-by: John Stultz --- kernel/time/Makefile | 1 + kernel/time/timekeeping.c | 2 ++ kernel/time/timekeeping_debug.c | 72 ++++++++++++++++++++++++++++++++++++++ kernel/time/timekeeping_internal.h | 14 ++++++++ 4 files changed, 89 insertions(+) create mode 100644 kernel/time/timekeeping_debug.c create mode 100644 kernel/time/timekeeping_internal.h (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ff7d9d2ab50..d52ac8bf000 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o +obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 933efa4071c..838fc0777b6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -25,6 +25,7 @@ #include "tick-internal.h" #include "ntp_internal.h" +#include "timekeeping_internal.h" static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); @@ -851,6 +852,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); + tk_debug_account_sleep_time(delta); } /** diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 00000000000..802433a4f5e --- /dev/null +++ b/kernel/time/timekeeping_debug.c @@ -0,0 +1,72 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include + +static unsigned int sleep_time_bin[32] = {0}; + +static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +{ + unsigned int bin; + seq_puts(s, " time (secs) count\n"); + seq_puts(s, "------------------------------\n"); + for (bin = 0; bin < 32; bin++) { + if (sleep_time_bin[bin] == 0) + continue; + seq_printf(s, "%10u - %-10u %4u\n", + bin ? 
1 << (bin - 1) : 0, 1 << bin, + sleep_time_bin[bin]); + } + return 0; +} + +static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) +{ + return single_open(file, tk_debug_show_sleep_time, NULL); +} + +static const struct file_operations tk_debug_sleep_time_fops = { + .open = tk_debug_sleep_time_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init tk_debug_sleep_time_init(void) +{ + struct dentry *d; + + d = debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); + if (!d) { + pr_err("Failed to create sleep_time debug file\n"); + return -ENOMEM; + } + + return 0; +} +late_initcall(tk_debug_sleep_time_init); + +void tk_debug_account_sleep_time(struct timespec *t) +{ + sleep_time_bin[fls(t->tv_sec)]++; +} + diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 00000000000..13323ea08ff --- /dev/null +++ b/kernel/time/timekeeping_internal.h @@ -0,0 +1,14 @@ +#ifndef _TIMEKEEPING_INTERNAL_H +#define _TIMEKEEPING_INTERNAL_H +/* + * timekeeping debug functions + */ +#include + +#ifdef CONFIG_DEBUG_FS +extern void tk_debug_account_sleep_time(struct timespec *t); +#else +#define tk_debug_account_sleep_time(x) +#endif + +#endif /* _TIMEKEEPING_INTERNAL_H */ -- cgit v1.2.3-70-g09d2 From 0de358f1c2642710d41190b73fbc295e675c4ab8 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 30 May 2013 14:34:20 +0530 Subject: sched/fair: Remove unused variable from expire_cfs_rq_runtime() Commit 78becc2709 ("sched: Use an accessor to read the rq clock") introduces rq_clock(), which obsoletes the use of the "rq" variable in expire_cfs_rq_runtime() and triggers this build warning: kernel/sched/fair.c: In function 'expire_cfs_rq_runtime': kernel/sched/fair.c:2159:13: warning: unused variable 'rq' [-Wunused-variable] Signed-off-by: Kamalesh Babulal Acked-by: Frederic Weisbecker Acked-by: Paul Turner Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1369904660-14169-1-git-send-email-kamalesh@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3ee1c2e4ae6..143dcdbc47a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2156,7 +2156,6 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct rq *rq = rq_of(cfs_rq); /* if the deadline is ahead of our clock, nothing to do */ if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) -- cgit v1.2.3-70-g09d2 From cd38ca854de15b26eb91009137cbe157d8a8e773 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 3 Jun 2013 18:20:29 +0000 Subject: PM / Hibernate: print physical addresses consistently with other parts of kernel Print physical address info in a style consistent with the %pR style used elsewhere in the kernel. Commit 69f1d475cc did this for a similar printk in this file, but I must have missed this one. Signed-off-by: Bjorn Helgaas Signed-off-by: Rafael J. 
Wysocki --- kernel/power/snapshot.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0de28576807..7872a35eafe 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -642,8 +642,9 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); Report: - printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", - start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); + printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); } /* -- cgit v1.2.3-70-g09d2 From 40b313608ad4ea655addd2ec6cdd106477ae8e15 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 21 May 2013 13:49:35 +1000 Subject: Finally eradicate CONFIG_HOTPLUG Ever since commit 45f035ab9b8f ("CONFIG_HOTPLUG should be always on"), it has been basically impossible to build a kernel with CONFIG_HOTPLUG turned off. Remove all the remaining references to it. Cc: Russell King Cc: Doug Thompson Cc: Bjorn Helgaas Cc: Steven Whitehouse Cc: Arnd Bergmann Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Andrew Morton Signed-off-by: Stephen Rothwell Acked-by: Mauro Carvalho Chehab Acked-by: Hans Verkuil Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-bus-pci | 5 +---- Documentation/SubmitChecklist | 2 +- Documentation/cpu-hotplug.txt | 2 +- Documentation/hwmon/submitting-patches | 3 +-- Documentation/kbuild/kconfig.txt | 2 +- Documentation/usb/hotplug.txt | 6 +++--- arch/arm/Kconfig | 2 +- arch/arm/kernel/module.c | 8 -------- arch/arm/kernel/vmlinux.lds.S | 4 ---- arch/arm/mach-ixp4xx/Kconfig | 1 - arch/blackfin/Kconfig | 2 +- arch/cris/arch-v32/drivers/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/mips/Kconfig | 2 +- arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 2 +- arch/powerpc/mm/tlb_hash64.c | 4 ++-- arch/s390/Kconfig | 1 - arch/sh/Kconfig | 2 +- arch/sparc/Kconfig | 1 - arch/x86/Kconfig | 2 +- drivers/base/Kconfig | 2 -- drivers/char/pcmcia/Kconfig | 2 +- drivers/edac/Kconfig | 2 +- drivers/pci/Kconfig | 2 -- drivers/pci/hotplug/Kconfig | 2 +- drivers/pcmcia/Kconfig | 1 - drivers/staging/media/go7007/go7007.txt | 1 - fs/gfs2/Kconfig | 5 ++--- include/asm-generic/vmlinux.lds.h | 20 -------------------- init/Kconfig | 3 --- kernel/power/Kconfig | 1 - mm/Kconfig | 2 +- 33 files changed, 22 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 1ce5ae329c0..5210a51c90f 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -64,7 +64,6 @@ Description: Writing a non-zero value to this attribute will force a rescan of all PCI buses in the system, and re-discover previously removed devices. - Depends on CONFIG_HOTPLUG. What: /sys/bus/pci/devices/.../msi_irqs/ Date: September, 2011 @@ -90,7 +89,6 @@ Contact: Linux PCI developers Description: Writing a non-zero value to this attribute will hot-remove the PCI device and any of its children. - Depends on CONFIG_HOTPLUG. What: /sys/bus/pci/devices/.../pci_bus/.../rescan Date: May 2011 @@ -99,7 +97,7 @@ Description: Writing a non-zero value to this attribute will force a rescan of the bus and all child buses, and re-discover devices removed earlier from this - part of the device tree. Depends on CONFIG_HOTPLUG. 
+ part of the device tree. What: /sys/bus/pci/devices/.../rescan Date: January 2009 @@ -109,7 +107,6 @@ Description: force a rescan of the device's parent bus and all child buses, and re-discover devices removed earlier from this part of the device tree. - Depends on CONFIG_HOTPLUG. What: /sys/bus/pci/devices/.../reset Date: July 2009 diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist index dc0e33210d7..2b7e32dfe00 100644 --- a/Documentation/SubmitChecklist +++ b/Documentation/SubmitChecklist @@ -105,5 +105,5 @@ kernel patches. same time, just various/random combinations of them]: CONFIG_SMP, CONFIG_SYSFS, CONFIG_PROC_FS, CONFIG_INPUT, CONFIG_PCI, - CONFIG_BLOCK, CONFIG_PM, CONFIG_HOTPLUG, CONFIG_MAGIC_SYSRQ, + CONFIG_BLOCK, CONFIG_PM, CONFIG_MAGIC_SYSRQ, CONFIG_NET, CONFIG_INET=n (but latter with CONFIG_NET=y) diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index 9f401350f50..0efd1b905b9 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt @@ -128,7 +128,7 @@ A: When doing make defconfig, Enable CPU hotplug support "Processor type and Features" -> Support for Hotpluggable CPUs -Make sure that you have CONFIG_HOTPLUG, and CONFIG_SMP turned on as well. +Make sure that you have CONFIG_SMP turned on as well. You would need to enable CONFIG_HOTPLUG_CPU for SMP suspend/resume support as well. diff --git a/Documentation/hwmon/submitting-patches b/Documentation/hwmon/submitting-patches index 843751c41fe..46286460462 100644 --- a/Documentation/hwmon/submitting-patches +++ b/Documentation/hwmon/submitting-patches @@ -27,8 +27,7 @@ increase the chances of your change being accepted. explicitly below the patch header. * If your patch (or the driver) is affected by configuration options such as - CONFIG_SMP or CONFIG_HOTPLUG, make sure it compiles for all configuration - variants. + CONFIG_SMP, make sure it compiles for all configuration variants. 2. Adding functionality to existing drivers diff --git a/Documentation/kbuild/kconfig.txt b/Documentation/kbuild/kconfig.txt index 3f429ed8b3b..213859e69e8 100644 --- a/Documentation/kbuild/kconfig.txt +++ b/Documentation/kbuild/kconfig.txt @@ -165,7 +165,7 @@ Searching in menuconfig: Example: /hotplug This lists all config symbols that contain "hotplug", - e.g., HOTPLUG, HOTPLUG_CPU, MEMORY_HOTPLUG. + e.g., HOTPLUG_CPU, MEMORY_HOTPLUG. For search help, enter / followed TAB-TAB-TAB (to highlight ) and Enter. This will tell you that you can also use diff --git a/Documentation/usb/hotplug.txt b/Documentation/usb/hotplug.txt index 4c945716a66..6424b130485 100644 --- a/Documentation/usb/hotplug.txt +++ b/Documentation/usb/hotplug.txt @@ -33,9 +33,9 @@ you get the best hotplugging when you configure a highly modular system. KERNEL HOTPLUG HELPER (/sbin/hotplug) -When you compile with CONFIG_HOTPLUG, you get a new kernel parameter: -/proc/sys/kernel/hotplug, which normally holds the pathname "/sbin/hotplug". -That parameter names a program which the kernel may invoke at various times. +There is a kernel parameter: /proc/sys/kernel/hotplug, which normally +holds the pathname "/sbin/hotplug". That parameter names a program +which the kernel may invoke at various times. The /sbin/hotplug program can be invoked by any subsystem as part of its reaction to a configuration change, from a thread in that subsystem. 
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 49d993cee51..365e79f4fbf 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1528,7 +1528,7 @@ config NR_CPUS config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" - depends on SMP && HOTPLUG + depends on SMP help Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 1e9be5d25e5..85c3fb6c93c 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -288,24 +288,16 @@ int module_finalize(const Elf32_Ehdr *hdr, const Elf_Shdr *sechdrs, if (strcmp(".ARM.exidx.init.text", secname) == 0) maps[ARM_SEC_INIT].unw_sec = s; - else if (strcmp(".ARM.exidx.devinit.text", secname) == 0) - maps[ARM_SEC_DEVINIT].unw_sec = s; else if (strcmp(".ARM.exidx", secname) == 0) maps[ARM_SEC_CORE].unw_sec = s; else if (strcmp(".ARM.exidx.exit.text", secname) == 0) maps[ARM_SEC_EXIT].unw_sec = s; - else if (strcmp(".ARM.exidx.devexit.text", secname) == 0) - maps[ARM_SEC_DEVEXIT].unw_sec = s; else if (strcmp(".init.text", secname) == 0) maps[ARM_SEC_INIT].txt_sec = s; - else if (strcmp(".devinit.text", secname) == 0) - maps[ARM_SEC_DEVINIT].txt_sec = s; else if (strcmp(".text", secname) == 0) maps[ARM_SEC_CORE].txt_sec = s; else if (strcmp(".exit.text", secname) == 0) maps[ARM_SEC_EXIT].txt_sec = s; - else if (strcmp(".devexit.text", secname) == 0) - maps[ARM_SEC_DEVEXIT].txt_sec = s; } for (i = 0; i < ARM_SEC_MAX; i++) diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index a871b8e00fc..fa25e4e425f 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -70,10 +70,6 @@ SECTIONS ARM_EXIT_DISCARD(EXIT_TEXT) ARM_EXIT_DISCARD(EXIT_DATA) EXIT_CALL -#ifndef CONFIG_HOTPLUG - *(.ARM.exidx.devexit.text) - *(.ARM.extab.devexit.text) -#endif #ifndef CONFIG_MMU *(.fixup) *(__ex_table) diff --git a/arch/arm/mach-ixp4xx/Kconfig b/arch/arm/mach-ixp4xx/Kconfig index 73a2d905af8..30e1ebe3a89 100644 --- a/arch/arm/mach-ixp4xx/Kconfig +++ b/arch/arm/mach-ixp4xx/Kconfig @@ -235,7 +235,6 @@ config IXP4XX_QMGR config IXP4XX_NPE tristate "IXP4xx Network Processor Engine support" select FW_LOADER - select HOTPLUG help This driver supports IXP4xx built-in network coprocessors and is automatically selected by Ethernet and HSS drivers. diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index a117652b5fe..b573827d041 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -253,7 +253,7 @@ config NR_CPUS config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" - depends on SMP && HOTPLUG + depends on SMP default y config BF_REV_MIN diff --git a/arch/cris/arch-v32/drivers/Kconfig b/arch/cris/arch-v32/drivers/Kconfig index c55971a40c3..ab725edbc68 100644 --- a/arch/cris/arch-v32/drivers/Kconfig +++ b/arch/cris/arch-v32/drivers/Kconfig @@ -617,7 +617,6 @@ config ETRAX_PV_CHANGEABLE_BITS config ETRAX_CARDBUS bool "Cardbus support" depends on ETRAX_ARCH_V32 - select HOTPLUG help Enabled the ETRAX Cardbus driver. diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 1a2b7749b04..5a768ad8e89 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -376,7 +376,6 @@ config NR_CPUS config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" depends on SMP - select HOTPLUG default n ---help--- Say Y here to experiment with turning CPUs off and on. 
CPUs diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 7a58ab933b2..e433b90507f 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -962,7 +962,7 @@ config SYS_HAS_EARLY_PRINTK config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" - depends on SMP && HOTPLUG && SYS_SUPPORTS_HOTPLUG_CPU + depends on SMP && SYS_SUPPORTS_HOTPLUG_CPU help Say Y here to allow turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 6507dabdd5d..2a2aea5aae5 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -254,7 +254,6 @@ config IRQSTACKS config HOTPLUG_CPU bool default y if SMP - select HOTPLUG config ARCH_SELECT_MEMORY_MODEL def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c33e3ad2c8f..508e3fe934d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -341,7 +341,7 @@ config SWIOTLB config HOTPLUG_CPU bool "Support for enabling/disabling CPUs" - depends on SMP && HOTPLUG && (PPC_PSERIES || \ + depends on SMP && (PPC_PSERIES || \ PPC_PMAC || PPC_POWERNV || (PPC_85xx && !PPC_E500MC)) ---help--- Say Y here to be able to disable and re-enable individual diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 023ec8a13f3..7df1c5edda8 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -183,8 +183,8 @@ void tlb_flush(struct mmu_gather *tlb) * since 64K pages may overlap with other bridges when using 64K pages * with 4K HW pages on IO space. * - * Because of that usage pattern, it's only available with CONFIG_HOTPLUG - * and is implemented for small size rather than speed. + * Because of that usage pattern, it is implemented for small size rather + * than speed. */ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, unsigned long end) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index da183c5a103..22f75b504f7 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -301,7 +301,6 @@ config HOTPLUG_CPU def_bool y prompt "Support for hot-pluggable CPUs" depends on SMP - select HOTPLUG help Say Y here to be able to turn CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu/cpu#. diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 8c868cf2cf9..1020dd85431 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -748,7 +748,7 @@ config NR_CPUS config HOTPLUG_CPU bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" - depends on SMP && HOTPLUG + depends on SMP help Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 9ac9f166633..a00cbd356db 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -243,7 +243,6 @@ config SECCOMP config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" depends on SPARC64 && SMP - select HOTPLUG help Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu/cpu#. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 685692c94f0..ae917f3965f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1725,7 +1725,7 @@ config PHYSICAL_ALIGN config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" - depends on SMP && HOTPLUG + depends on SMP ---help--- Say Y here to allow turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. 
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 07abd9d76f7..5daa2599ed4 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -2,7 +2,6 @@ menu "Generic Driver Options" config UEVENT_HELPER_PATH string "path to uevent helper" - depends on HOTPLUG default "" help Path to uevent helper program forked by the kernel for @@ -23,7 +22,6 @@ config UEVENT_HELPER_PATH config DEVTMPFS bool "Maintain a devtmpfs filesystem to mount at /dev" - depends on HOTPLUG help This creates a tmpfs/ramfs filesystem instance early at bootup. In this filesystem, the kernel driver core maintains device diff --git a/drivers/char/pcmcia/Kconfig b/drivers/char/pcmcia/Kconfig index 2a166d56738..b27f5342fe7 100644 --- a/drivers/char/pcmcia/Kconfig +++ b/drivers/char/pcmcia/Kconfig @@ -3,7 +3,7 @@ # menu "PCMCIA character devices" - depends on HOTPLUG && PCMCIA!=n + depends on PCMCIA!=n config SYNCLINK_CS tristate "SyncLink PC Card support" diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index e443f2c1dfd..a697a64d538 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -145,7 +145,7 @@ config EDAC_E7XXX config EDAC_E752X tristate "Intel e752x (e7520, e7525, e7320) and 3100" - depends on EDAC_MM_EDAC && PCI && X86 && HOTPLUG + depends on EDAC_MM_EDAC && PCI && X86 help Support for error detection and correction on the Intel E7520, E7525, E7320 server chipsets. diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 6d51aa68ec7..77497f140d6 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -55,7 +55,6 @@ config PCI_STUB config XEN_PCIDEV_FRONTEND tristate "Xen PCI Frontend" depends on PCI && X86 && XEN - select HOTPLUG select PCI_XEN select XEN_XENBUS_FRONTEND default y @@ -113,7 +112,6 @@ config PCI_IOAPIC tristate "PCI IO-APIC hotplug support" if X86 depends on PCI depends on ACPI - depends on HOTPLUG default !X86 config PCI_LABEL diff --git a/drivers/pci/hotplug/Kconfig b/drivers/pci/hotplug/Kconfig index 9fcb87f353d..bb7ebb22db0 100644 --- a/drivers/pci/hotplug/Kconfig +++ b/drivers/pci/hotplug/Kconfig @@ -4,7 +4,7 @@ menuconfig HOTPLUG_PCI tristate "Support for PCI Hotplug" - depends on PCI && HOTPLUG && SYSFS + depends on PCI && SYSFS ---help--- Say Y here if you have a motherboard with a PCI Hotplug controller. This allows you to add and remove PCI cards while the machine is diff --git a/drivers/pcmcia/Kconfig b/drivers/pcmcia/Kconfig index b90f85bf5f8..1c6362491bd 100644 --- a/drivers/pcmcia/Kconfig +++ b/drivers/pcmcia/Kconfig @@ -4,7 +4,6 @@ menuconfig PCCARD tristate "PCCard (PCMCIA/CardBus) support" - depends on HOTPLUG ---help--- Say Y here if you want to attach PCMCIA- or PC-cards to your Linux computer. These are credit-card size devices such as network cards, diff --git a/drivers/staging/media/go7007/go7007.txt b/drivers/staging/media/go7007/go7007.txt index fcb3e235abb..dc0026cff9f 100644 --- a/drivers/staging/media/go7007/go7007.txt +++ b/drivers/staging/media/go7007/go7007.txt @@ -78,7 +78,6 @@ All vendor-built kernels should already be configured properly. 
However, for custom-built kernels, the following options need to be enabled in the kernel as built-in or modules: - CONFIG_HOTPLUG - Support for hot-pluggable devices CONFIG_MODULES - Enable loadable module support CONFIG_KMOD - Automatic kernel module loading CONFIG_FW_LOADER - Hotplug firmware loading support diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 5a376ab81fe..90c6a8faaec 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -20,13 +20,12 @@ config GFS2_FS be found here: http://sources.redhat.com/cluster The "nolock" lock module is now built in to GFS2 by default. If - you want to use the DLM, be sure to enable HOTPLUG and IPv4/6 - networking. + you want to use the DLM, be sure to enable IPv4/6 networking. config GFS2_FS_LOCKING_DLM bool "GFS2 DLM locking" depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \ - HOTPLUG && CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS) + CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS) help Multiple node locking module for GFS2 diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index eb58d2d7d97..4f2737208c4 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -68,14 +68,6 @@ * are handled as text/data or they can be discarded (which * often happens at runtime) */ -#ifdef CONFIG_HOTPLUG -#define DEV_KEEP(sec) *(.dev##sec) -#define DEV_DISCARD(sec) -#else -#define DEV_KEEP(sec) -#define DEV_DISCARD(sec) *(.dev##sec) -#endif - #ifdef CONFIG_HOTPLUG_CPU #define CPU_KEEP(sec) *(.cpu##sec) #define CPU_DISCARD(sec) @@ -182,8 +174,6 @@ *(.data) \ *(.ref.data) \ *(.data..shared_aligned) /* percpu related */ \ - DEV_KEEP(init.data) \ - DEV_KEEP(exit.data) \ CPU_KEEP(init.data) \ CPU_KEEP(exit.data) \ MEM_KEEP(init.data) \ @@ -372,8 +362,6 @@ /* __*init sections */ \ __init_rodata : AT(ADDR(__init_rodata) - LOAD_OFFSET) { \ *(.ref.rodata) \ - DEV_KEEP(init.rodata) \ - DEV_KEEP(exit.rodata) \ CPU_KEEP(init.rodata) \ CPU_KEEP(exit.rodata) \ MEM_KEEP(init.rodata) \ @@ -416,8 +404,6 @@ *(.text.hot) \ *(.text) \ *(.ref.text) \ - DEV_KEEP(init.text) \ - DEV_KEEP(exit.text) \ CPU_KEEP(init.text) \ CPU_KEEP(exit.text) \ MEM_KEEP(init.text) \ @@ -503,7 +489,6 @@ /* init and exit section handling */ #define INIT_DATA \ *(.init.data) \ - DEV_DISCARD(init.data) \ CPU_DISCARD(init.data) \ MEM_DISCARD(init.data) \ KERNEL_CTORS() \ @@ -511,7 +496,6 @@ *(.init.rodata) \ FTRACE_EVENTS() \ TRACE_SYSCALLS() \ - DEV_DISCARD(init.rodata) \ CPU_DISCARD(init.rodata) \ MEM_DISCARD(init.rodata) \ CLK_OF_TABLES() \ @@ -521,14 +505,11 @@ #define INIT_TEXT \ *(.init.text) \ - DEV_DISCARD(init.text) \ CPU_DISCARD(init.text) \ MEM_DISCARD(init.text) #define EXIT_DATA \ *(.exit.data) \ - DEV_DISCARD(exit.data) \ - DEV_DISCARD(exit.rodata) \ CPU_DISCARD(exit.data) \ CPU_DISCARD(exit.rodata) \ MEM_DISCARD(exit.data) \ @@ -536,7 +517,6 @@ #define EXIT_TEXT \ *(.exit.text) \ - DEV_DISCARD(exit.text) \ CPU_DISCARD(exit.text) \ MEM_DISCARD(exit.text) diff --git a/init/Kconfig b/init/Kconfig index 9d3a7887a6d..a5e0917165f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1244,9 +1244,6 @@ config SYSCTL_ARCH_UNALIGN_ALLOW the unaligned access emulation. 
see arch/parisc/kernel/unaligned.c for reference -config HOTPLUG - def_bool y - config HAVE_PCSPKR_PLATFORM bool diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5dfdc9ea180..9c39de095ba 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,7 +100,6 @@ config PM_SLEEP_SMP depends on SMP depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE depends on PM_SLEEP - select HOTPLUG select HOTPLUG_CPU config PM_AUTOSLEEP diff --git a/mm/Kconfig b/mm/Kconfig index e742d06285b..f5e698e30d4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -173,7 +173,7 @@ config HAVE_BOOTMEM_INFO_NODE config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA - depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG + depends on ARCH_ENABLE_MEMORY_HOTPLUG depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) config MEMORY_HOTPLUG_SPARSE -- cgit v1.2.3-70-g09d2 From f12dc020149fad7087e119e54cffea668272bf7d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jun 2013 19:13:02 -0700 Subject: cgroup: mark "tasks" cgroup file as insane Some resources controlled by cgroup aren't per-task and cgroup core allowing threads of a single thread_group to be in different cgroups forced memcg do explicitly find the group leader and use it. This is gonna be nasty when transitioning to unified hierarchy and in general we don't want and won't support granularity finer than processes. Mark "tasks" with CFTYPE_INSANE. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: cgroups@vger.kernel.org Cc: Vivek Goyal --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fefc41c1a14..1e0f445b5b8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4037,6 +4037,7 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, static struct cftype files[] = { { .name = "tasks", + .flags = CFTYPE_INSANE, /* use "procs" instead */ .open = cgroup_tasks_open, .write_u64 = cgroup_tasks_write, .release = cgroup_pidlist_release, -- cgit v1.2.3-70-g09d2 From cc5943a7816ba6c00639837a62131386619548dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jun 2013 19:13:55 -0700 Subject: cgroup: mark "notify_on_release" and "release_agent" cgroup files insane The empty cgroup notification mechanism currently implemented in cgroup is tragically outdated. Forking and execing userland process stopped being a viable notification mechanism more than a decade ago. We're gonna have a saner mechanism. Let's make it clear that this abomination is going away. Mark "notify_on_release" and "release_agent" with CFTYPE_INSANE. 
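
As a hedged illustration of the same pattern applied to controller-specific files (the controller, the file names and the callbacks below are hypothetical, not part of this series): a legacy, unprefixed knob can carry CFTYPE_INSANE so it is simply not created under sane_behavior, while its prefixed replacement remains available.

#include <linux/cgroup.h>

static u64 example_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 0;	/* placeholder */
}

static int example_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	return 0;	/* placeholder */
}

static struct cftype example_files[] = {
	{
		/* prefixed interface, kept under sane_behavior */
		.name = "example.threshold",
		.read_u64 = example_read,
		.write_u64 = example_write,
	},
	{
		/* historical spelling, not created under sane_behavior */
		.name = "threshold",
		.flags = CFTYPE_INSANE,
		.read_u64 = example_read,
		.write_u64 = example_write,
	},
	{ }	/* terminate */
};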
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1e0f445b5b8..b3bb8a39364 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4052,6 +4052,7 @@ static struct cftype files[] = { }, { .name = "notify_on_release", + .flags = CFTYPE_INSANE, .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, @@ -4073,7 +4074,7 @@ static struct cftype files[] = { }, { .name = "release_agent", - .flags = CFTYPE_ONLY_ON_ROOT, + .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .read_seq_string = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, -- cgit v1.2.3-70-g09d2 From d5c56ced775f6bdc32b689b01c9c4f9b66e18610 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jun 2013 19:14:34 -0700 Subject: cgroup: clean up the cftype array for the base cgroup files * Rename it from files[] (really?) to cgroup_base_files[]. * Drop CGROUP_FILE_GENERIC_PREFIX which was defined as "cgroup." and used inconsistently. Just use "cgroup." directly. * Collect insane files at the end. Note that only the insane ones are missing "cgroup." prefix. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b3bb8a39364..bc53d5014b2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4029,35 +4029,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, return 0; } -/* - * for the common functions, 'private' gives the type of file - */ -/* for hysterical raisins, we can't put this on the older files */ -#define CGROUP_FILE_GENERIC_PREFIX "cgroup." -static struct cftype files[] = { +static struct cftype cgroup_base_files[] = { { - .name = "tasks", - .flags = CFTYPE_INSANE, /* use "procs" instead */ - .open = cgroup_tasks_open, - .write_u64 = cgroup_tasks_write, - .release = cgroup_pidlist_release, - .mode = S_IRUGO | S_IWUSR, - }, - { - .name = CGROUP_FILE_GENERIC_PREFIX "procs", + .name = "cgroup.procs", .open = cgroup_procs_open, .write_u64 = cgroup_procs_write, .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { - .name = "notify_on_release", - .flags = CFTYPE_INSANE, - .read_u64 = cgroup_read_notify_on_release, - .write_u64 = cgroup_write_notify_on_release, - }, - { - .name = CGROUP_FILE_GENERIC_PREFIX "event_control", + .name = "cgroup.event_control", .write_string = cgroup_write_event_control, .mode = S_IWUGO, }, @@ -4072,6 +4053,26 @@ static struct cftype files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .read_seq_string = cgroup_sane_behavior_show, }, + + /* + * Historical crazy stuff. These don't have "cgroup." prefix and + * don't exist if sane_behavior. If you're depending on these, be + * prepared to be burned. 
+ */ + { + .name = "tasks", + .flags = CFTYPE_INSANE, /* use "procs" instead */ + .open = cgroup_tasks_open, + .write_u64 = cgroup_tasks_write, + .release = cgroup_pidlist_release, + .mode = S_IRUGO | S_IWUSR, + }, + { + .name = "notify_on_release", + .flags = CFTYPE_INSANE, + .read_u64 = cgroup_read_notify_on_release, + .write_u64 = cgroup_write_notify_on_release, + }, { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, @@ -4095,7 +4096,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, struct cgroup_subsys *ss; if (base_files) { - err = cgroup_addrm_files(cgrp, NULL, files, true); + err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); if (err < 0) return err; } -- cgit v1.2.3-70-g09d2 From 06d6b3cbdf94bc37732df83e7c25774370411a56 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:11 +0800 Subject: cpuset: remove redundant check in cpuset_cpus_allowed_fallback() task_cs() will never return NULL. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64b3f791bbe..f0c884a0e57 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2253,8 +2253,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) rcu_read_lock(); cs = task_cs(tsk); - if (cs) - do_set_cpus_allowed(tsk, cs->cpus_allowed); + do_set_cpus_allowed(tsk, cs->cpus_allowed); rcu_read_unlock(); /* -- cgit v1.2.3-70-g09d2 From 40df2deb50570b288b7067b111af0aa9ca640e6f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:23 +0800 Subject: cpuset: cleanup guarantee_online_{cpus|mems}() - We never pass a NULL @cs to these functions. - The top cpuset always has some online cpus/mems. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f0c884a0e57..d753837cca3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -304,53 +304,38 @@ static struct file_system_type cpuset_fs_type = { /* * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. If we get - * all the way to the top and still haven't found any online cpus, - * return cpu_online_mask. Or if passed a NULL cs from an exit'ing - * task, return cpu_online_mask. + * until we find one that does have some online cpus. The top + * cpuset always has some cpus online. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * * Call with callback_mutex held. */ - static void guarantee_online_cpus(const struct cpuset *cs, struct cpumask *pmask) { - while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) + while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = parent_cs(cs); - if (cs) - cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); - else - cpumask_copy(pmask, cpu_online_mask); - BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); + cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); } /* * Return in *pmask the portion of a cpusets's mems_allowed that * are online, with memory. If none are online with memory, walk * up the cpuset hierarchy until we find one that does have some - * online mems. 
If we get all the way to the top and still haven't - * found any online mems, return node_states[N_MEMORY]. + * online mems. The top cpuset always has some mems online. * * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * * Call with callback_mutex held. */ - static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) { - while (cs && !nodes_intersects(cs->mems_allowed, - node_states[N_MEMORY])) + while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) cs = parent_cs(cs); - if (cs) - nodes_and(*pmask, cs->mems_allowed, - node_states[N_MEMORY]); - else - *pmask = node_states[N_MEMORY]; - BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); + nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); } /* -- cgit v1.2.3-70-g09d2 From 67bd2c59850de20d0ecdc8084cbbfe34e53b6804 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:35 +0800 Subject: cpuset: remove unnecessary variable in cpuset_attach() We can just use oldcs->mems_allowed. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d753837cca3..dbef832e5e2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1407,8 +1407,7 @@ static cpumask_var_t cpus_attach; static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { - /* static bufs protected by cpuset_mutex */ - static nodemask_t cpuset_attach_nodemask_from; + /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct mm_struct *mm; struct task_struct *task; @@ -1442,13 +1441,12 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * Change mm, possibly for multiple threads in a threadgroup. This is * expensive and may sleep. */ - cpuset_attach_nodemask_from = oldcs->mems_allowed; cpuset_attach_nodemask_to = cs->mems_allowed; mm = get_task_mm(leader); if (mm) { mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, + cpuset_migrate_mm(mm, &oldcs->mems_allowed, &cpuset_attach_nodemask_to); mmput(mm); } -- cgit v1.2.3-70-g09d2 From 249cc86db7492dc8de1d2eddebc6bcc4ab2a8e9e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:48 +0800 Subject: cpuset: remove cpuset_test_cpumask() The test is done in set_cpus_allowed_ptr(), so it's redundant. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index dbef832e5e2..51f8e1d5a2a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -783,23 +783,6 @@ void rebuild_sched_domains(void) mutex_unlock(&cpuset_mutex); } -/** - * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's - * @tsk: task to test - * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner - * - * Call with cpuset_mutex held. May take callback_mutex during call. - * Called for each task in a cgroup by cgroup_scan_tasks(). - * Return nonzero if this tasks's cpus_allowed mask should be changed (in other - * words, if its mask is not equal to its cpuset's mask). 
- */ -static int cpuset_test_cpumask(struct task_struct *tsk, - struct cgroup_scanner *scan) -{ - return !cpumask_equal(&tsk->cpus_allowed, - (cgroup_cs(scan->cg))->cpus_allowed); -} - /** * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's * @tsk: task to test @@ -835,7 +818,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) struct cgroup_scanner scan; scan.cg = cs->css.cgroup; - scan.test_task = cpuset_test_cpumask; + scan.test_task = NULL; scan.process_task = cpuset_change_cpumask; scan.heap = heap; cgroup_scan_tasks(&scan); -- cgit v1.2.3-70-g09d2 From a73456f37b9dbc917398387d0cba926b4455b70f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:59 +0800 Subject: cpuset: re-structure update_cpumask() a bit Check if cpus_allowed is to be changed before calling validate_change(). This won't change any behavior, but later it will allow us to do this: # mkdir /cpuset/child # echo $$ > /cpuset/child/tasks /* empty cpuset */ # echo > /cpuset/child/cpuset.cpus /* do nothing, won't fail */ Without this patch, the last operation will fail. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 51f8e1d5a2a..535dce685ee 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -856,14 +856,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) return -EINVAL; } - retval = validate_change(cs, trialcs); - if (retval < 0) - return retval; /* Nothing to do if the cpus didn't change */ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; + retval = validate_change(cs, trialcs); + if (retval < 0) + return retval; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); if (retval) return retval; -- cgit v1.2.3-70-g09d2 From c5a130325f13b219438cb100e2da71a3e31199f3 Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Thu, 6 Jun 2013 15:20:51 -0700 Subject: ACPI/APEI: Add parameter check before error injection When param1 is enabled in EINJ but not assigned with a valid value, sometimes it will cause the error like below: APEI: Can not request [mem 0x7aaa7000-0x7aaa7007] for APEI EINJ Trigger registers It is because some firmware will access target address specified in param1 to trigger the error when injecting memory error. This will cause resource conflict with regular memory. So It must be removed from trigger table resources, but incorrect param1/param2 combination will stop this action. Add extra check to avoid this kind of error. Signed-off-by: Chen Gong Signed-off-by: Tony Luck --- drivers/acpi/apei/einj.c | 38 +++++++++++++++++++++++++++++++++++--- kernel/resource.c | 1 + 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c index 2cc8e034a3c..fb57d03e698 100644 --- a/drivers/acpi/apei/einj.c +++ b/drivers/acpi/apei/einj.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include "apei-internal.h" @@ -41,6 +42,10 @@ #define SPIN_UNIT 100 /* 100ns */ /* Firmware should respond within 1 milliseconds */ #define FIRMWARE_TIMEOUT (1 * NSEC_PER_MSEC) +#define ACPI5_VENDOR_BIT BIT(31) +#define MEM_ERROR_MASK (ACPI_EINJ_MEMORY_CORRECTABLE | \ + ACPI_EINJ_MEMORY_UNCORRECTABLE | \ + ACPI_EINJ_MEMORY_FATAL) /* * ACPI version 5 provides a SET_ERROR_TYPE_WITH_ADDRESS action. 
@@ -367,7 +372,7 @@ static int __einj_error_trigger(u64 trigger_paddr, u32 type, * This will cause resource conflict with regular memory. So * remove it from trigger table resources. */ - if ((param_extension || acpi5) && (type & 0x0038) && param2) { + if ((param_extension || acpi5) && (type & MEM_ERROR_MASK) && param2) { struct apei_resources addr_resources; apei_resources_init(&addr_resources); trigger_param_region = einj_get_trigger_parameter_region( @@ -427,7 +432,7 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2) struct set_error_type_with_address *v5param = einj_param; v5param->type = type; - if (type & 0x80000000) { + if (type & ACPI5_VENDOR_BIT) { switch (vendor_flags) { case SETWA_FLAGS_APICID: v5param->apicid = param1; @@ -512,7 +517,34 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2) static int einj_error_inject(u32 type, u64 param1, u64 param2) { int rc; + unsigned long pfn; + /* + * We need extra sanity checks for memory errors. + * Other types leap directly to injection. + */ + + /* ensure param1/param2 existed */ + if (!(param_extension || acpi5)) + goto inject; + + /* ensure injection is memory related */ + if (type & ACPI5_VENDOR_BIT) { + if (vendor_flags != SETWA_FLAGS_MEM) + goto inject; + } else if (!(type & MEM_ERROR_MASK)) + goto inject; + + /* + * Disallow crazy address masks that give BIOS leeway to pick + * injection address almost anywhere. Insist on page or + * better granularity and that target address is normal RAM. + */ + pfn = PFN_DOWN(param1 & param2); + if (!page_is_ram(pfn) || ((param2 & PAGE_MASK) != PAGE_MASK)) + return -EINVAL; + +inject: mutex_lock(&einj_mutex); rc = __einj_error_inject(type, param1, param2); mutex_unlock(&einj_mutex); @@ -590,7 +622,7 @@ static int error_type_set(void *data, u64 val) * Vendor defined types have 0x80000000 bit set, and * are not enumerated by ACPI_EINJ_GET_ERROR_TYPE */ - vendor = val & 0x80000000; + vendor = val & ACPI5_VENDOR_BIT; tval = val & 0x7fffffff; /* Only one error type can be specified */ diff --git a/kernel/resource.c b/kernel/resource.c index d7386986e10..77bf11a86c7 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -409,6 +409,7 @@ int __weak page_is_ram(unsigned long pfn) { return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; } +EXPORT_SYMBOL_GPL(page_is_ram); void __weak arch_remove_reservations(struct resource *avail) { -- cgit v1.2.3-70-g09d2 From e44193d39e8d4d1de5d996fcd37ed75e5c704f10 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 9 Jun 2013 17:14:22 +0800 Subject: cpuset: let hotplug propagation work wait for task attaching Instead of triggering propagation work in cpuset_attach(), we make hotplug propagation work wait until there's no task attaching in progress. IMO this is more robust. We won't see empty masks in cpuset_attach(). Also it's a preparation for removing propagation work. Without asynchronous propagation we can't call move_tasks_in_empty_cpuset() in cpuset_attach(), because otherwise we'll deadlock on cgroup_mutex. tj: typo fixes. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 535dce685ee..e902473f76b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -59,6 +59,7 @@ #include #include #include +#include /* * Tracks how many cpusets are currently defined in system. 
@@ -275,6 +276,8 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); +static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); + /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we @@ -1436,14 +1439,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) } cs->attach_in_progress--; - - /* - * We may have raced with CPU/memory hotunplug. Trigger hotplug - * propagation if @cs doesn't have any CPU or memory. It will move - * the newly added tasks to the nearest parent which can execute. - */ - if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - schedule_cpuset_propagate_hotplug(cs); + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); mutex_unlock(&cpuset_mutex); } @@ -1555,10 +1552,6 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, * resources, wait for the previously scheduled operations before * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. - * - * Flushing cpuset_hotplug_work is enough to synchronize against - * hotplug hanlding; however, cpuset_attach() may schedule - * propagation work directly. Flush the workqueue too. */ flush_work(&cpuset_hotplug_work); flush_workqueue(cpuset_propagate_hotplug_wq); @@ -2005,8 +1998,20 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work) struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); bool is_empty; +retry: + wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); + mutex_lock(&cpuset_mutex); + /* + * We have raced with task attaching. We wait until attaching + * is finished, so we won't attach a task to an empty cpuset. + */ + if (cs->attach_in_progress) { + mutex_unlock(&cpuset_mutex); + goto retry; + } + cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); -- cgit v1.2.3-70-g09d2 From 388afd8549dc8be0920e00ae9404341593b6bd7c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 9 Jun 2013 17:14:47 +0800 Subject: cpuset: remove async hotplug propagation work As we can drop rcu read lock while iterating cgroup hierarchy, we don't have to do propagation asynchronously via workqueue. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 69 +++++++++++++-------------------------------------------- 1 file changed, 16 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e902473f76b..608fe1308b2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -101,8 +101,6 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - - struct work_struct hotplug_work; }; /* Retrieve the cpuset for a cgroup */ @@ -268,12 +266,7 @@ static DEFINE_MUTEX(callback_mutex); /* * CPU / memory hotplug is handled asynchronously. */ -static struct workqueue_struct *cpuset_propagate_hotplug_wq; - static void cpuset_hotplug_workfn(struct work_struct *work); -static void cpuset_propagate_hotplug_workfn(struct work_struct *work); -static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); - static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); @@ -1554,7 +1547,6 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, * after execution capability is restored. 
*/ flush_work(&cpuset_hotplug_work); - flush_workqueue(cpuset_propagate_hotplug_wq); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) @@ -1821,7 +1813,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); fmeter_init(&cs->fmeter); - INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); cs->relax_domain_level = -1; return &cs->css; @@ -1984,18 +1975,17 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } /** - * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset + * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest * * Compare @cs's cpu and mem masks against top_cpuset and if some have gone * offline, update @cs accordingly. If @cs ends up with no CPU or memory, * all its tasks are moved to the nearest ancestor with both resources. */ -static void cpuset_propagate_hotplug_workfn(struct work_struct *work) +static void cpuset_hotplug_update_tasks(struct cpuset *cs) { static cpumask_t off_cpus; static nodemask_t off_mems, tmp_mems; - struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); bool is_empty; retry: @@ -2044,34 +2034,6 @@ retry: */ if (is_empty) remove_tasks_in_empty_cpuset(cs); - - /* the following may free @cs, should be the last operation */ - css_put(&cs->css); -} - -/** - * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset - * @cs: cpuset of interest - * - * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and - * memory masks according to top_cpuset. - */ -static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) -{ - /* - * Pin @cs. The refcnt will be released when the work item - * finishes executing. - */ - if (!css_tryget(&cs->css)) - return; - - /* - * Queue @cs->hotplug_work. If already pending, lose the css ref. - * cpuset_propagate_hotplug_wq is ordered and propagation will - * happen in the order this function is called. - */ - if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) - css_put(&cs->css); } /** @@ -2084,8 +2046,8 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) * actively using CPU hotplug but making no active use of cpusets. * * Non-root cpusets are only affected by offlining. If any CPUs or memory - * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all - * descendants. + * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on + * all descendants. * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. 
@@ -2128,21 +2090,26 @@ static void cpuset_hotplug_workfn(struct work_struct *work) update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); } + mutex_unlock(&cpuset_mutex); + /* if cpus or mems went down, we need to propagate to descendants */ if (cpus_offlined || mems_offlined) { struct cpuset *cs; struct cgroup *pos_cgrp; rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) - schedule_cpuset_propagate_hotplug(cs); - rcu_read_unlock(); - } + cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { + if (!css_tryget(&cs->css)) + continue; + rcu_read_unlock(); - mutex_unlock(&cpuset_mutex); + cpuset_hotplug_update_tasks(cs); - /* wait for propagations to finish */ - flush_workqueue(cpuset_propagate_hotplug_wq); + rcu_read_lock(); + css_put(&cs->css); + } + rcu_read_unlock(); + } /* rebuild sched domains if cpus_allowed has changed */ if (cpus_updated) @@ -2193,10 +2160,6 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_MEMORY]; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); - - cpuset_propagate_hotplug_wq = - alloc_ordered_workqueue("cpuset_hotplug", 0); - BUG_ON(!cpuset_propagate_hotplug_wq); } /** -- cgit v1.2.3-70-g09d2 From 5e1cda5b8ae93f5f02e8c5a30390ac9b4d2c20e6 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 29 May 2013 03:10:53 +0100 Subject: irqdomain: Relax failure path on setting up mappings Commit 98aa468e, "irqdomain: Support for static IRQ mapping and association" introduced an API for directly associating blocks of hwirqs to linux irqs. However, if any irq in that block failed to map (say if the mapping functions returns an error because the irq is already mapped) then the whole thing will fail and roll back. This is probably too aggressive since there are valid reasons why a mapping may fail. ie. Firmware may have a particular IRQ marked as unusable. This patch drops the error path out of irq_domain_associate(). If a mapping fails, then it is simply skipped. There is no reason to fail the entire allocation. v2: Still output an information message on failed mappings and make sure attempted mapping gets cleared out of the irq_data structure. Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner --- kernel/irq/irqdomain.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 20b677dd0b2..61d6d3c80fe 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -464,23 +464,15 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, /* * If map() returns -EPERM, this interrupt is protected * by the firmware or some other service and shall not - * be mapped. - * - * Since on some platforms we blindly try to map everything - * we end up with a log full of backtraces. - * - * So instead, we silently fail on -EPERM, it is the - * responsibility of the PIC driver to display a relevant - * message if needed. + * be mapped. Don't bother telling the user about it. 
*/ if (ret != -EPERM) { - pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", - virq, hwirq, ret); - WARN_ON(1); + pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", + of_node_full_name(domain->of_node), hwirq, virq, ret); } irq_data->domain = NULL; irq_data->hwirq = 0; - goto err_unmap; + continue; } } -- cgit v1.2.3-70-g09d2 From 9bbf877d3b6b8c5991000296f40a3f0fe66fa89b Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 6 Jun 2013 12:10:24 +0100 Subject: irqdomain: Replace LEGACY mapping with LINEAR The LEGACY mapping unnecessarily complicates the irqdomain code and can easily be implemented with a linear mapping. By ripping it out and replacing it with the LINEAR mapping the object size of irqdomain.c shrinks by about 330 bytes (ARMv7) which offsets the additional allocation required by the linear map. It also makes it possible for current LEGACY map users to pre-allocate irq_descs for a subset of the hwirqs and dynamically allocate the rest as needed. Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Rob Herring --- include/linux/irqdomain.h | 7 ---- kernel/irq/irqdomain.c | 84 ++++------------------------------------------- 2 files changed, 6 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index ba2c708adcf..6f062416e5b 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -93,11 +93,6 @@ struct irq_domain { /* type of reverse mapping_technique */ unsigned int revmap_type; union { - struct { - unsigned int size; - unsigned int first_irq; - irq_hw_number_t first_hwirq; - } legacy; struct { unsigned int size; unsigned int *revmap; @@ -117,8 +112,6 @@ struct irq_domain { struct irq_domain_chip_generic *gc; }; -#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. - * ie. legacy 8259, gets irqs 1..15 */ #define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ #define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ #define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 61d6d3c80fe..1ac8cf41b9a 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -82,13 +82,6 @@ void irq_domain_remove(struct irq_domain *domain) mutex_lock(&irq_domain_mutex); switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - /* - * Legacy domains don't manage their own irq_desc - * allocations, we expect the caller to handle irq_desc - * freeing on their own. - */ - break; case IRQ_DOMAIN_MAP_TREE: /* * radix_tree_delete() takes care of destroying the root @@ -122,17 +115,6 @@ void irq_domain_remove(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_domain_remove); -static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, - irq_hw_number_t hwirq) -{ - irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; - int size = domain->revmap_data.legacy.size; - - if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) - return 0; - return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; -} - /** * irq_domain_add_simple() - Allocate and register a simple irq_domain. * @of_node: pointer to interrupt controller's device tree node. 
@@ -213,57 +195,17 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, void *host_data) { struct irq_domain *domain; - unsigned int i; - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); + pr_debug("Setting up legacy domain virq[%i:%i] ==> hwirq[%i:%i]\n", + first_irq, first_irq + size - 1, + (int)first_hwirq, (int)first_hwirq + size -1); + + domain = irq_domain_add_linear(of_node, first_hwirq + size, ops, host_data); if (!domain) return NULL; - domain->revmap_data.legacy.first_irq = first_irq; - domain->revmap_data.legacy.first_hwirq = first_hwirq; - domain->revmap_data.legacy.size = size; - - mutex_lock(&irq_domain_mutex); - /* Verify that all the irqs are available */ - for (i = 0; i < size; i++) { - int irq = first_irq + i; - struct irq_data *irq_data = irq_get_irq_data(irq); - - if (WARN_ON(!irq_data || irq_data->domain)) { - mutex_unlock(&irq_domain_mutex); - irq_domain_free(domain); - return NULL; - } - } + WARN_ON(irq_domain_associate_many(domain, first_irq, first_hwirq, size)); - /* Claim all of the irqs before registering a legacy domain */ - for (i = 0; i < size; i++) { - struct irq_data *irq_data = irq_get_irq_data(first_irq + i); - irq_data->hwirq = first_hwirq + i; - irq_data->domain = domain; - } - mutex_unlock(&irq_domain_mutex); - - for (i = 0; i < size; i++) { - int irq = first_irq + i; - int hwirq = first_hwirq + i; - - /* IRQ0 gets ignored */ - if (!irq) - continue; - - /* Legacy flags are left to default at this point, - * one can then use irq_create_mapping() to - * explicitly change them - */ - if (ops->map) - ops->map(domain, irq, hwirq); - - /* Clear norequest flags */ - irq_clear_status_flags(irq, IRQ_NOREQUEST); - } - - irq_domain_add(domain); return domain; } EXPORT_SYMBOL_GPL(irq_domain_add_legacy); @@ -492,10 +434,6 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, } return 0; - - err_unmap: - irq_domain_disassociate_many(domain, irq_base, i); - return -EINVAL; } EXPORT_SYMBOL_GPL(irq_domain_associate_many); @@ -575,10 +513,6 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return virq; } - /* Get a virtual interrupt number */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return irq_domain_legacy_revmap(domain, hwirq); - /* Allocate a virtual interrupt number */ hint = hwirq % nr_irqs; if (hint == 0) @@ -706,10 +640,6 @@ void irq_dispose_mapping(unsigned int virq) if (WARN_ON(domain == NULL)) return; - /* Never unmap legacy interrupts */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return; - irq_domain_disassociate_many(domain, virq, 1); irq_free_desc(virq); } @@ -732,8 +662,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain, return 0; switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - return irq_domain_legacy_revmap(domain, hwirq); case IRQ_DOMAIN_MAP_LINEAR: return irq_linear_revmap(domain, hwirq); case IRQ_DOMAIN_MAP_TREE: -- cgit v1.2.3-70-g09d2 From 0bb4afb45dd1add73ca643a865daa38716aeff0c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 6 Jun 2013 14:23:30 +0100 Subject: irqdomain: Add a name field This patch adds a name field to the irq_domain structure to help mere mortals understand the mappings between irq domains and virqs. It also converts a number of places that have open-coded some kind of fudging an irqdomain name to use the new field. This means a more consistent display of names in irq domain log messages and debugfs output. 
Signed-off-by: Grant Likely --- include/linux/irqdomain.h | 1 + kernel/irq/generic-chip.c | 1 + kernel/irq/irqdomain.c | 19 ++++++------------- 3 files changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 6f062416e5b..e5e513c2d10 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -89,6 +89,7 @@ struct irq_domain_chip_generic; */ struct irq_domain { struct list_head link; + const char *name; /* type of reverse mapping_technique */ unsigned int revmap_type; diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 95575d8d539..ca98cc5d630 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -305,6 +305,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, /* Calc pointer to the next generic chip */ tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); } + d->name = name; return 0; } EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1ac8cf41b9a..b1b5e6793fd 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -410,12 +410,15 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, */ if (ret != -EPERM) { pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", - of_node_full_name(domain->of_node), hwirq, virq, ret); + domain->name, hwirq, virq, ret); } irq_data->domain = NULL; irq_data->hwirq = 0; continue; } + /* If not already assigned, give the domain the chip's name */ + if (!domain->name && irq_data->chip) + domain->name = irq_data->chip->name; } switch (domain->revmap_type) { @@ -708,8 +711,6 @@ static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; struct irq_desc *desc; - const char *p; - static const char none[] = "none"; void *data; int i; @@ -731,20 +732,12 @@ static int virq_debug_show(struct seq_file *m, void *private) seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); chip = irq_desc_get_chip(desc); - if (chip && chip->name) - p = chip->name; - else - p = none; - seq_printf(m, "%-15s ", p); + seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); data = irq_desc_get_chip_data(desc); seq_printf(m, data ? "0x%p " : " %p ", data); - if (desc->irq_data.domain) - p = of_node_full_name(desc->irq_data.domain->of_node); - else - p = none; - seq_printf(m, "%s\n", p); + seq_printf(m, "%s\n", desc->irq_data.domain->name); } raw_spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.3-70-g09d2 From cef5075c8c238ffd04c86a77a5a9bdbd18031137 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 11 Jul 2012 17:24:31 +0100 Subject: irqdomain: merge linear and tree reverse mappings. Keeping them separate makes irq_domain more complex and adds a lot of code (as proven by the diffstat). Merging them simplifies the whole scheme. This change makes it so both the tree and linear methods can be used by the same irq_domain instance. If the hwirq is less than the ->linear_size, then the linear map is used to reverse map the hwirq. Otherwise the radix tree is used. The test for which map to use is no more expensive that the existing code, so the performance of fast path is preserved. It also means that complex interrupt controllers can use both the linear map and a tree in the same domain. This may be useful for an interrupt controller with a base set of core irqs and a large number of GPIOs which might be used as irqs. 
The linear map could cover the core irqs, and the tree used for the gpios. v2: Drop reorganization of revmap data Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Rob Herring --- include/linux/irqdomain.h | 18 ++++---- kernel/irq/irqdomain.c | 107 +++++++++++++--------------------------------- 2 files changed, 39 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e5e513c2d10..1cbb7413c12 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -75,7 +75,6 @@ struct irq_domain_chip_generic; * @link: Element in global irq_domain list. * @revmap_type: Method used for reverse mapping hwirq numbers to linux irq. This * will be one of the IRQ_DOMAIN_MAP_* values. - * @revmap_data: Revmap method specific data. * @ops: pointer to irq_domain methods * @host_data: private data pointer for use by owner. Not touched by irq_domain * core code. @@ -93,10 +92,9 @@ struct irq_domain { /* type of reverse mapping_technique */ unsigned int revmap_type; - union { + struct { struct { unsigned int size; - unsigned int *revmap; } linear; struct { unsigned int max_irq; @@ -111,11 +109,13 @@ struct irq_domain { struct device_node *of_node; /* Optional pointer to generic interrupt chips */ struct irq_domain_chip_generic *gc; + + /* Linear reverse map */ + unsigned int linear_revmap[]; }; #define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ #define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ -#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ #ifdef CONFIG_IRQ_DOMAIN struct irq_domain *irq_domain_add_simple(struct device_node *of_node, unsigned int size, @@ -137,10 +137,6 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, unsigned int max_irq, const struct irq_domain_ops *ops, void *host_data); -struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - const struct irq_domain_ops *ops, - void *host_data); - extern struct irq_domain *irq_find_host(struct device_node *node); extern void irq_set_default_host(struct irq_domain *host); @@ -152,6 +148,12 @@ static inline struct irq_domain *irq_domain_add_legacy_isa( return irq_domain_add_legacy(of_node, NUM_ISA_INTERRUPTS, 0, 0, ops, host_data); } +static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, + const struct irq_domain_ops *ops, + void *host_data) +{ + return irq_domain_add_linear(of_node, 0, ops, host_data); +} extern void irq_domain_remove(struct irq_domain *host); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index b1b5e6793fd..5a1d8ec8509 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -34,22 +34,24 @@ static struct irq_domain *irq_default_domain; * to IRQ domain, or NULL on failure.
*/ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, - unsigned int revmap_type, + unsigned int revmap_type, int size, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; - domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, - of_node_to_nid(of_node)); + domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), + GFP_KERNEL, of_node_to_nid(of_node)); if (WARN_ON(!domain)) return NULL; /* Fill structure */ + INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); domain->revmap_type = revmap_type; domain->ops = ops; domain->host_data = host_data; domain->of_node = of_node_get(of_node); + domain->revmap_data.linear.size = size; return domain; } @@ -81,22 +83,12 @@ void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_TREE: - /* - * radix_tree_delete() takes care of destroying the root - * node when all entries are removed. Shout if there are - * any mappings left. - */ - WARN_ON(domain->revmap_data.tree.height); - break; - case IRQ_DOMAIN_MAP_LINEAR: - kfree(domain->revmap_data.linear.revmap); - domain->revmap_data.linear.size = 0; - break; - case IRQ_DOMAIN_MAP_NOMAP: - break; - } + /* + * radix_tree_delete() takes care of destroying the root + * node when all entries are removed. Shout if there are + * any mappings left. + */ + WARN_ON(domain->revmap_data.tree.height); list_del(&domain->link); @@ -223,20 +215,11 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, void *host_data) { struct irq_domain *domain; - unsigned int *revmap; - revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL, - of_node_to_nid(of_node)); - if (WARN_ON(!revmap)) + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, size, ops, host_data); + if (!domain) return NULL; - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); - if (!domain) { - kfree(revmap); - return NULL; - } - domain->revmap_data.linear.size = size; - domain->revmap_data.linear.revmap = revmap; irq_domain_add(domain); return domain; } @@ -248,7 +231,7 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, void *host_data) { struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_NOMAP, ops, host_data); + IRQ_DOMAIN_MAP_NOMAP, 0, ops, host_data); if (domain) { domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; irq_domain_add(domain); @@ -257,28 +240,6 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, } EXPORT_SYMBOL_GPL(irq_domain_add_nomap); -/** - * irq_domain_add_tree() - * @of_node: pointer to interrupt controller's device tree node. - * @ops: map/unmap domain callbacks - * - * Note: The radix tree will be allocated later during boot automatically - * (the reverse mapping will use the slow path until that happens). 
- */ -struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_TREE, ops, host_data); - if (domain) { - INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); - irq_domain_add(domain); - } - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_tree); - /** * irq_find_host() - Locates a domain for a given device node * @node: device-tree node of the interrupt controller @@ -359,17 +320,13 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, irq_data->domain = NULL; irq_data->hwirq = 0; - /* Clear reverse map */ - switch(domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < domain->revmap_data.linear.size) - domain->revmap_data.linear.revmap[hwirq] = 0; - break; - case IRQ_DOMAIN_MAP_TREE: + /* Clear reverse map for this hwirq */ + if (hwirq < domain->revmap_data.linear.size) { + domain->linear_revmap[hwirq] = 0; + } else { mutex_lock(&revmap_trees_mutex); radix_tree_delete(&domain->revmap_data.tree, hwirq); mutex_unlock(&revmap_trees_mutex); - break; } } } @@ -421,16 +378,12 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, domain->name = irq_data->chip->name; } - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < domain->revmap_data.linear.size) - domain->revmap_data.linear.revmap[hwirq] = virq; - break; - case IRQ_DOMAIN_MAP_TREE: + if (hwirq < domain->revmap_data.linear.size) { + domain->linear_revmap[hwirq] = virq; + } else { mutex_lock(&revmap_trees_mutex); radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); - break; } irq_clear_status_flags(virq, IRQ_NOREQUEST); @@ -667,13 +620,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain, switch (domain->revmap_type) { case IRQ_DOMAIN_MAP_LINEAR: return irq_linear_revmap(domain, hwirq); - case IRQ_DOMAIN_MAP_TREE: - rcu_read_lock(); - data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); - rcu_read_unlock(); - if (data) - return data->irq; - break; case IRQ_DOMAIN_MAP_NOMAP: data = irq_get_irq_data(hwirq); if (data && (data->domain == domain) && (data->hwirq == hwirq)) @@ -696,13 +642,18 @@ EXPORT_SYMBOL_GPL(irq_find_mapping); unsigned int irq_linear_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) { + struct irq_data *data; BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); /* Check revmap bounds; complain if exceeded */ - if (WARN_ON(hwirq >= domain->revmap_data.linear.size)) - return 0; + if (hwirq >= domain->revmap_data.linear.size) { + rcu_read_lock(); + data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); + rcu_read_unlock(); + return data ? data->irq : 0; + } - return domain->revmap_data.linear.revmap[hwirq]; + return domain->linear_revmap[hwirq]; } EXPORT_SYMBOL_GPL(irq_linear_revmap); -- cgit v1.2.3-70-g09d2 From 1aa0dd94ca07df818cf14588c9031ab1d7fd84d3 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 8 Jun 2013 12:03:59 +0100 Subject: irqdomain: Eliminate revmap type The NOMAP irq_domain type is only used by a handful of interrupt controllers and it unnecessarily complicates the code by adding special cases on how to look up mappings and different revmap functions are used for each type which need to validate the correct type is passed to it before performing the reverse map. Eliminating the revmap_type and making a single reverse mapping function simplifies the code. 
It also shouldn't be any slower than having separate revmap functions because the type of the revmap needed to be checked anyway. The linear and tree revmap types were already merged in a previous patch. This patch rolls the NOMAP or direct mapping behaviour into the same domain code making is possible for an irq domain to do any mapping type; linear, tree or direct; and that the mapping will be transparent to the interrupt controller driver. With this change, direct mappings will get stored in the linear or tree mapping for consistency. Reverse mapping from the hwirq to virq will go through the normal lookup process. However, any controller using a direct mapping can take advantage of knowing that hwirq==virq for any mapped interrupts skip doing a revmap lookup when handling IRQs. Signed-off-by: Grant Likely --- include/linux/irqdomain.h | 48 +++++++++++++++++------------------------ kernel/irq/generic-chip.c | 5 +---- kernel/irq/irqdomain.c | 55 +++++++++++++++++++---------------------------- 3 files changed, 43 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 1cbb7413c12..51ef84a3c99 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -73,50 +73,42 @@ struct irq_domain_chip_generic; /** * struct irq_domain - Hardware interrupt number translation object * @link: Element in global irq_domain list. - * @revmap_type: Method used for reverse mapping hwirq numbers to linux irq. This - * will be one of the IRQ_DOMAIN_MAP_* values. + * @name: Name of interrupt domain * @ops: pointer to irq_domain methods * @host_data: private data pointer for use by owner. Not touched by irq_domain * core code. - * @irq_base: Start of irq_desc range assigned to the irq_domain. The creator - * of the irq_domain is responsible for allocating the array of - * irq_desc structures. - * @nr_irq: Number of irqs managed by the irq domain - * @hwirq_base: Starting number for hwirqs managed by the irq domain - * @of_node: (optional) Pointer to device tree nodes associated with the - * irq_domain. Used when decoding device tree interrupt specifiers. + * + * Optional elements + * @of_node: Pointer to device tree nodes associated with the irq_domain. Used + * when decoding device tree interrupt specifiers. + * @gc: Pointer to a list of generic chips. There is a helper function for + * setting up one or more generic chips for interrupt controllers + * drivers using the generic chip library which uses this pointer. + * + * Revmap data, used internally by irq_domain + * @revmap_direct_max_irq: The largest hwirq that can be set for controllers that + * support direct mapping + * @revmap_size: Size of the linear map table @linear_revmap[] + * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map + * @linear_revmap: Linear table of hwirq->virq reverse mappings */ struct irq_domain { struct list_head link; const char *name; - - /* type of reverse mapping_technique */ - unsigned int revmap_type; - struct { - struct { - unsigned int size; - } linear; - struct { - unsigned int max_irq; - } nomap; - struct radix_tree_root tree; - } revmap_data; const struct irq_domain_ops *ops; void *host_data; - irq_hw_number_t inval_irq; - /* Optional device node pointer */ + /* Optional data */ struct device_node *of_node; - /* Optional pointer to generic interrupt chips */ struct irq_domain_chip_generic *gc; - /* Linear reverse map */ + /* reverse map data. 
The linear map gets appended to the irq_domain */ + unsigned int revmap_direct_max_irq; + unsigned int revmap_size; + struct radix_tree_root revmap_tree; unsigned int linear_revmap[]; }; -#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ -#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ - #ifdef CONFIG_IRQ_DOMAIN struct irq_domain *irq_domain_add_simple(struct device_node *of_node, unsigned int size, diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index ca98cc5d630..4b011064e14 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -270,10 +270,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, if (d->gc) return -EBUSY; - if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) - return -EINVAL; - - numchips = d->revmap_data.linear.size / irqs_per_chip; + numchips = d->revmap_size / irqs_per_chip; if (!numchips) return -EINVAL; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5a1d8ec8509..c38be78fceb 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -25,7 +25,6 @@ static struct irq_domain *irq_default_domain; /** * irq_domain_alloc() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller - * @revmap_type: type of reverse mapping to use * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * @@ -34,7 +33,7 @@ static struct irq_domain *irq_default_domain; * to IRQ domain, or NULL on failure. */ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, - unsigned int revmap_type, int size, + int size, const struct irq_domain_ops *ops, void *host_data) { @@ -46,12 +45,11 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, return NULL; /* Fill structure */ - INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); - domain->revmap_type = revmap_type; + INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); domain->ops = ops; domain->host_data = host_data; domain->of_node = of_node_get(of_node); - domain->revmap_data.linear.size = size; + domain->revmap_size = size; return domain; } @@ -67,8 +65,7 @@ static void irq_domain_add(struct irq_domain *domain) mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); - pr_debug("Allocated domain of type %d @0x%p\n", - domain->revmap_type, domain); + pr_debug("Added domain %s\n", domain->name); } /** @@ -88,7 +85,7 @@ void irq_domain_remove(struct irq_domain *domain) * node when all entries are removed. Shout if there are * any mappings left. 
*/ - WARN_ON(domain->revmap_data.tree.height); + WARN_ON(domain->revmap_tree.height); list_del(&domain->link); @@ -100,8 +97,7 @@ void irq_domain_remove(struct irq_domain *domain) mutex_unlock(&irq_domain_mutex); - pr_debug("Removed domain of type %d @0x%p\n", - domain->revmap_type, domain); + pr_debug("Removed domain %s\n", domain->name); irq_domain_free(domain); } @@ -216,7 +212,7 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, { struct irq_domain *domain; - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, size, ops, host_data); + domain = irq_domain_alloc(of_node, size, ops, host_data); if (!domain) return NULL; @@ -230,10 +226,9 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_NOMAP, 0, ops, host_data); + struct irq_domain *domain = irq_domain_alloc(of_node, 0, ops, host_data); if (domain) { - domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; + domain->revmap_direct_max_irq = max_irq ? max_irq : ~0; irq_domain_add(domain); } return domain; @@ -321,11 +316,11 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, irq_data->hwirq = 0; /* Clear reverse map for this hwirq */ - if (hwirq < domain->revmap_data.linear.size) { + if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = 0; } else { mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&domain->revmap_data.tree, hwirq); + radix_tree_delete(&domain->revmap_tree, hwirq); mutex_unlock(&revmap_trees_mutex); } } @@ -378,11 +373,11 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, domain->name = irq_data->chip->name; } - if (hwirq < domain->revmap_data.linear.size) { + if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = virq; } else { mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); + radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); } @@ -399,7 +394,9 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many); * * This routine is used for irq controllers which can choose the hardware * interrupt numbers they generate. In such a case it's simplest to use - * the linux irq as the hardware interrupt number. + * the linux irq as the hardware interrupt number. It still uses the linear + * or radix tree to store the mapping, but the irq controller can optimize + * the revmap path by using the hwirq directly. 
*/ unsigned int irq_create_direct_mapping(struct irq_domain *domain) { @@ -408,17 +405,14 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) if (domain == NULL) domain = irq_default_domain; - if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP)) - return 0; - virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); if (!virq) { pr_debug("create_direct virq allocation failed\n"); return 0; } - if (virq >= domain->revmap_data.nomap.max_irq) { + if (virq >= domain->revmap_direct_max_irq) { pr_err("ERROR: no free irqs available below %i maximum\n", - domain->revmap_data.nomap.max_irq); + domain->revmap_direct_max_irq); irq_free_desc(virq); return 0; } @@ -617,17 +611,13 @@ unsigned int irq_find_mapping(struct irq_domain *domain, if (domain == NULL) return 0; - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - return irq_linear_revmap(domain, hwirq); - case IRQ_DOMAIN_MAP_NOMAP: + if (hwirq < domain->revmap_direct_max_irq) { data = irq_get_irq_data(hwirq); if (data && (data->domain == domain) && (data->hwirq == hwirq)) return hwirq; - break; } - return 0; + return irq_linear_revmap(domain, hwirq); } EXPORT_SYMBOL_GPL(irq_find_mapping); @@ -643,12 +633,11 @@ unsigned int irq_linear_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) { struct irq_data *data; - BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); /* Check revmap bounds; complain if exceeded */ - if (hwirq >= domain->revmap_data.linear.size) { + if (hwirq >= domain->revmap_size) { rcu_read_lock(); - data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); + data = radix_tree_lookup(&domain->revmap_tree, hwirq); rcu_read_unlock(); return data ? data->irq : 0; } -- cgit v1.2.3-70-g09d2 From fa40f377577752b83252b9d2b3165d4acee0eb7c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 8 Jun 2013 12:57:40 +0100 Subject: irqdomain: Clean up aftermath of irq_domain refactoring After refactoring the irqdomain code, there are a number of API functions that are merely empty wrappers around core code. Drop those wrappers out of the C file and replace them with static inlines in the header. 
Signed-off-by: Grant Likely --- arch/powerpc/platforms/cell/beat_interrupt.c | 2 +- arch/powerpc/platforms/powermac/smp.c | 2 +- include/linux/irqdomain.h | 31 +++++-- kernel/irq/irqdomain.c | 127 ++++++++------------------- 4 files changed, 62 insertions(+), 100 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/cell/beat_interrupt.c b/arch/powerpc/platforms/cell/beat_interrupt.c index 8c6dc42ecf6..9e5dfbcc00a 100644 --- a/arch/powerpc/platforms/cell/beat_interrupt.c +++ b/arch/powerpc/platforms/cell/beat_interrupt.c @@ -239,7 +239,7 @@ void __init beatic_init_IRQ(void) ppc_md.get_irq = beatic_get_irq; /* Allocate an irq host */ - beatic_host = irq_domain_add_nomap(NULL, 0, &beatic_pic_host_ops, NULL); + beatic_host = irq_domain_add_nomap(NULL, ~0, &beatic_pic_host_ops, NULL); BUG_ON(beatic_host == NULL); irq_set_default_host(beatic_host); } diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index bdb738a69e4..f921067d924 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -192,7 +192,7 @@ static int psurge_secondary_ipi_init(void) { int rc = -ENOMEM; - psurge_host = irq_domain_add_nomap(NULL, 0, &psurge_host_ops, NULL); + psurge_host = irq_domain_add_nomap(NULL, ~0, &psurge_host_ops, NULL); if (psurge_host) psurge_secondary_virq = irq_create_direct_mapping(psurge_host); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 51ef84a3c99..fd4b26f8f44 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -110,6 +110,10 @@ struct irq_domain { }; #ifdef CONFIG_IRQ_DOMAIN +struct irq_domain *__irq_domain_add(struct device_node *of_node, + int size, int direct_max, + const struct irq_domain_ops *ops, + void *host_data); struct irq_domain *irq_domain_add_simple(struct device_node *of_node, unsigned int size, unsigned int first_irq, @@ -121,17 +125,30 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, irq_hw_number_t first_hwirq, const struct irq_domain_ops *ops, void *host_data); -struct irq_domain *irq_domain_add_linear(struct device_node *of_node, +extern struct irq_domain *irq_find_host(struct device_node *node); +extern void irq_set_default_host(struct irq_domain *host); + +/** + * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. + * @size: Number of interrupts in the domain. 
+ * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + */ +static inline struct irq_domain *irq_domain_add_linear(struct device_node *of_node, unsigned int size, const struct irq_domain_ops *ops, - void *host_data); -struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, + void *host_data) +{ + return __irq_domain_add(of_node, size, 0, ops, host_data); +} +static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, unsigned int max_irq, const struct irq_domain_ops *ops, - void *host_data); -extern struct irq_domain *irq_find_host(struct device_node *node); -extern void irq_set_default_host(struct irq_domain *host); - + void *host_data) +{ + return __irq_domain_add(of_node, 0, max_irq, ops, host_data); +} static inline struct irq_domain *irq_domain_add_legacy_isa( struct device_node *of_node, const struct irq_domain_ops *ops, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index c38be78fceb..e0db59e2eef 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,8 +23,11 @@ static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; /** - * irq_domain_alloc() - Allocate a new irq_domain data structure + * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller + * @size: Size of linear map; 0 for radix mapping only + * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no + * direct mapping * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * @@ -32,10 +35,10 @@ static struct irq_domain *irq_default_domain; * register allocated irq_domain with irq_domain_register(). Returns pointer * to IRQ domain, or NULL on failure. */ -static struct irq_domain *irq_domain_alloc(struct device_node *of_node, - int size, - const struct irq_domain_ops *ops, - void *host_data) +struct irq_domain *__irq_domain_add(struct device_node *of_node, + int size, int direct_max, + const struct irq_domain_ops *ops, + void *host_data) { struct irq_domain *domain; @@ -50,23 +53,16 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, domain->host_data = host_data; domain->of_node = of_node_get(of_node); domain->revmap_size = size; + domain->revmap_direct_max_irq = direct_max; - return domain; -} - -static void irq_domain_free(struct irq_domain *domain) -{ - of_node_put(domain->of_node); - kfree(domain); -} - -static void irq_domain_add(struct irq_domain *domain) -{ mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); + pr_debug("Added domain %s\n", domain->name); + return domain; } +EXPORT_SYMBOL_GPL(__irq_domain_add); /** * irq_domain_remove() - Remove an irq domain. @@ -99,30 +95,28 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); - irq_domain_free(domain); + of_node_put(domain->of_node); + kfree(domain); } EXPORT_SYMBOL_GPL(irq_domain_remove); /** - * irq_domain_add_simple() - Allocate and register a simple irq_domain. + * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs * @of_node: pointer to interrupt controller's device tree node. * @size: total number of irqs in mapping * @first_irq: first number of irq block assigned to the domain, - * pass zero to assign irqs on-the-fly. 
This will result in a - * linear IRQ domain so it is important to use irq_create_mapping() - * for each used IRQ, especially when SPARSE_IRQ is enabled. + * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then + * pre-map all of the irqs in the domain to virqs starting at first_irq. * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * - * Allocates a legacy irq_domain if irq_base is positive or a linear - * domain otherwise. For the legacy domain, IRQ descriptors will also - * be allocated. + * Allocates an irq_domain, and optionally if first_irq is positive then also + * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq. * * This is intended to implement the expected behaviour for most - * interrupt controllers which is that a linear mapping should - * normally be used unless the system requires a legacy mapping in - * order to support supplying interrupt numbers during non-DT - * registration of devices. + * interrupt controllers. If device tree is used, then first_irq will be 0 and + * irqs get mapped dynamically on the fly. However, if the controller requires + * static virq assignments (non-DT boot) then it will set that up correctly. */ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, unsigned int size, @@ -130,33 +124,25 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - if (first_irq > 0) { - int irq_base; + struct irq_domain *domain; + + domain = __irq_domain_add(of_node, size, 0, ops, host_data); + if (!domain) + return NULL; + if (first_irq > 0) { if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { - /* - * Set the descriptor allocator to search for a - * 1-to-1 mapping, such as irq_alloc_desc_at(). - * Use of_node_to_nid() which is defined to - * numa_node_id() on platforms that have no custom - * implementation. - */ - irq_base = irq_alloc_descs(first_irq, first_irq, size, - of_node_to_nid(of_node)); - if (irq_base < 0) { + /* attempt to allocated irq_descs */ + int rc = irq_alloc_descs(first_irq, first_irq, size, + of_node_to_nid(of_node)); + if (rc < 0) pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", first_irq); - irq_base = first_irq; - } - } else - irq_base = first_irq; - - return irq_domain_add_legacy(of_node, size, irq_base, 0, - ops, host_data); + } + WARN_ON(irq_domain_associate_many(domain, first_irq, 0, size)); } - /* A linear domain is the default */ - return irq_domain_add_linear(of_node, size, ops, host_data); + return domain; } EXPORT_SYMBOL_GPL(irq_domain_add_simple); @@ -184,11 +170,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, { struct irq_domain *domain; - pr_debug("Setting up legacy domain virq[%i:%i] ==> hwirq[%i:%i]\n", - first_irq, first_irq + size - 1, - (int)first_hwirq, (int)first_hwirq + size -1); - - domain = irq_domain_add_linear(of_node, first_hwirq + size, ops, host_data); + domain = __irq_domain_add(of_node, first_hwirq + size, 0, ops, host_data); if (!domain) return NULL; @@ -198,43 +180,6 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, } EXPORT_SYMBOL_GPL(irq_domain_add_legacy); -/** - * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. - * @of_node: pointer to interrupt controller's device tree node. - * @size: Number of interrupts in the domain. 
- * @ops: map/unmap domain callbacks - * @host_data: Controller private data pointer - */ -struct irq_domain *irq_domain_add_linear(struct device_node *of_node, - unsigned int size, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain; - - domain = irq_domain_alloc(of_node, size, ops, host_data); - if (!domain) - return NULL; - - irq_domain_add(domain); - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_linear); - -struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, - unsigned int max_irq, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain = irq_domain_alloc(of_node, 0, ops, host_data); - if (domain) { - domain->revmap_direct_max_irq = max_irq ? max_irq : ~0; - irq_domain_add(domain); - } - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_nomap); - /** * irq_find_host() - Locates a domain for a given device node * @node: device-tree node of the interrupt controller -- cgit v1.2.3-70-g09d2 From 1400ea86025a22862f97e7fe544433751b43ecec Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 6 Jun 2013 22:20:44 +0100 Subject: irqdomain: Beef up debugfs output This patch increases the amount of output produced by the irq_domain_mapping debugfs file by first listing all of the registered irq domains at the beginning of the output, and then by including all mapped IRQs in the output, not just the active ones. It is very useful when debugging irqdomain issues to be able to see the entire list of mapped irqs, not just the ones that happen to be connected to devices. Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e0db59e2eef..280b8047d8d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -596,12 +596,29 @@ static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; struct irq_desc *desc; - void *data; + struct irq_domain *domain; + struct radix_tree_iter iter; + void *data, **slot; int i; - seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", + seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", + "name", "mapped", "linear-max", "direct-max", "devtree-node"); + mutex_lock(&irq_domain_mutex); + list_for_each_entry(domain, &irq_domain_list, link) { + int count = 0; + radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) + count++; + seq_printf(m, "%c%-16s %6u %10u %10u %s\n", + domain == irq_default_domain ? '*' : ' ', domain->name, + domain->revmap_size + count, domain->revmap_size, + domain->revmap_direct_max_irq, + domain->of_node ? of_node_full_name(domain->of_node) : ""); + } + mutex_unlock(&irq_domain_mutex); + + seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq", "chip name", (int)(2 * sizeof(void *) + 2), "chip data", - "domain name"); + "active", "type", "domain"); for (i = 1; i < nr_irqs; i++) { desc = irq_to_desc(i); @@ -609,12 +626,15 @@ static int virq_debug_show(struct seq_file *m, void *private) continue; raw_spin_lock_irqsave(&desc->lock, flags); + domain = desc->irq_data.domain; - if (desc->action && desc->action->handler) { + if (domain) { struct irq_chip *chip; + int hwirq = desc->irq_data.hwirq; + bool direct; seq_printf(m, "%5d ", i); - seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); + seq_printf(m, "0x%05x ", hwirq); chip = irq_desc_get_chip(desc); seq_printf(m, "%-15s ", (chip && chip->name) ? 
chip->name : "none"); @@ -622,6 +642,11 @@ static int virq_debug_show(struct seq_file *m, void *private) data = irq_desc_get_chip_data(desc); seq_printf(m, data ? "0x%p " : " %p ", data); + seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); + direct = (i == hwirq) && (i < domain->revmap_direct_max_irq); + seq_printf(m, "%6s%-8s ", + (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", + direct ? "(DIRECT)" : ""); seq_printf(m, "%s\n", desc->irq_data.domain->name); } -- cgit v1.2.3-70-g09d2 From d7f3e207397d7b4868e33d3f88396a06f4d5a8c7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Mar 2013 16:19:52 -0700 Subject: rcu: Convert rcutree.c printk calls This commit converts printk() calls to the corresponding pr_*() calls. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 35380019f0f..1009c0ccd4b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -866,7 +866,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", + pr_err("INFO: %s detected stalls on CPUs/tasks:", rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { @@ -899,7 +899,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) smp_processor_id(), (long)(jiffies - rsp->gp_start), rsp->gpnum, rsp->completed, totqlen); if (ndetected == 0) - printk(KERN_ERR "INFO: Stall ended before state dump start\n"); + pr_err("INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) rcu_dump_cpu_stacks(rsp); @@ -922,7 +922,7 @@ static void print_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); + pr_err("INFO: %s self-detected stall on CPU", rsp->name); print_cpu_stall_info_begin(); print_cpu_stall_info(rsp, smp_processor_id()); print_cpu_stall_info_end(); -- cgit v1.2.3-70-g09d2 From 6eaef633d77f50f031dd355ff5f91aaa1aaf9885 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2013 10:08:37 -0700 Subject: rcu: Move code to apply callback-numbering simplifications The addition of callback numbering allows combining the detection of the ends of old grace periods and the beginnings of new grace periods. This commit moves code to set the stage for this combining. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 118 +++++++++++++++++++++++++++---------------------------- 1 file changed, 59 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1009c0ccd4b..c36e52dc091 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -984,65 +984,6 @@ void rcu_cpu_stall_reset(void) rsp->jiffies_stall = jiffies + ULONG_MAX / 2; } -/* - * Update CPU-local rcu_data state to record the newly noticed grace period. - * This is used both when we started the grace period and when we notice - * that someone else started the grace period. The caller must hold the - * ->lock of the leaf rcu_node structure corresponding to the current CPU, - * and must have irqs disabled. 
- */ -static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - if (rdp->gpnum != rnp->gpnum) { - /* - * If the current grace period is waiting for this CPU, - * set up to detect a quiescent state, otherwise don't - * go looking for one. - */ - rdp->gpnum = rnp->gpnum; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); - rdp->passed_quiesce = 0; - rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); - zero_cpu_stall_ticks(rdp); - } -} - -static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_node *rnp; - - local_irq_save(flags); - rnp = rdp->mynode; - if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ - local_irq_restore(flags); - return; - } - __note_new_gpnum(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Did someone else start a new RCU grace period start since we last - * checked? Update local state appropriately if so. Must be called - * on the CPU corresponding to rdp. - */ -static int -check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - int ret = 0; - - local_irq_save(flags); - if (rdp->gpnum != rsp->gpnum) { - note_new_gpnum(rsp, rdp); - ret = 1; - } - local_irq_restore(flags); - return ret; -} - /* * Initialize the specified rcu_data structure's callback list to empty. */ @@ -1359,6 +1300,45 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat } } +/* + * Update CPU-local rcu_data state to record the newly noticed grace period. + * This is used both when we started the grace period and when we notice + * that someone else started the grace period. The caller must hold the + * ->lock of the leaf rcu_node structure corresponding to the current CPU, + * and must have irqs disabled. + */ +static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + if (rdp->gpnum != rnp->gpnum) { + /* + * If the current grace period is waiting for this CPU, + * set up to detect a quiescent state, otherwise don't + * go looking for one. + */ + rdp->gpnum = rnp->gpnum; + trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); + rdp->passed_quiesce = 0; + rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); + zero_cpu_stall_ticks(rdp); + } +} + +static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_node *rnp; + + local_irq_save(flags); + rnp = rdp->mynode; + if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ + !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ + local_irq_restore(flags); + return; + } + __note_new_gpnum(rsp, rnp, rdp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + /* * Advance this CPU's callbacks, but only if the current grace period * has ended. This may be called only from the CPU to whom the rdp @@ -1381,6 +1361,26 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } +/* + * Did someone else start a new RCU grace period start since we last + * checked? Update local state appropriately if so. Must be called + * on the CPU corresponding to rdp. 
+ */ +static int +check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + int ret = 0; + + local_irq_save(flags); + if (rdp->gpnum != rsp->gpnum) { + note_new_gpnum(rsp, rdp); + ret = 1; + } + local_irq_restore(flags); + return ret; +} + /* * Do per-CPU grace-period initialization for running CPU. The caller * must hold the lock of the leaf rcu_node structure corresponding to -- cgit v1.2.3-70-g09d2 From 398ebe6000c16135d12ce2ff64318f306ffb20b0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2013 10:53:14 -0700 Subject: rcu: Make __note_new_gpnum() check for ends of prior grace periods The current implementation can detect the beginning of a new grace period before noting the end of a previous grace period. Although the current implementation correctly handles this sort of nonsense, it would be good to reduce RCU's state space by making such nonsense unnecessary, which is now possible thanks to the fact that RCU's callback groups are now numbered. This commit therefore makes __note_new_gpnum() invoke __rcu_process_gp_end() in order to note the ends of prior grace periods before noting the beginnings of new grace periods. Of course, this now means that note_new_gpnum() notes both the beginnings and ends of grace periods, and could therefore be used in place of rcu_process_gp_end(). But that is a job for later commits. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index c36e52dc091..54aba759b60 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1309,6 +1309,9 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat */ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + /* Handle the ends of any preceding grace periods first. */ + __rcu_process_gp_end(rsp, rnp, rdp); + if (rdp->gpnum != rnp->gpnum) { /* * If the current grace period is waiting for this CPU, -- cgit v1.2.3-70-g09d2 From d34ea3221a0f34ed42eadabf054604bbcc7ecd27 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2013 11:10:43 -0700 Subject: rcu: Rename note_new_gpnum() to note_gp_changes() Because note_new_gpnum() now also checks for the ends of old grace periods, this commit changes its name to note_gp_changes(). Later commits will merge rcu_process_gp_end() into note_gp_changes(). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 54aba759b60..7eb2bc95300 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1307,7 +1307,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat * ->lock of the leaf rcu_node structure corresponding to the current CPU, * and must have irqs disabled. */ -static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { /* Handle the ends of any preceding grace periods first. 
*/ __rcu_process_gp_end(rsp, rnp, rdp); @@ -1326,19 +1326,20 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct } } -static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) +static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_node *rnp; local_irq_save(flags); rnp = rdp->mynode; - if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ + if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && + rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } - __note_new_gpnum(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1377,7 +1378,7 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); if (rdp->gpnum != rsp->gpnum) { - note_new_gpnum(rsp, rdp); + note_gp_changes(rsp, rdp); ret = 1; } local_irq_restore(flags); @@ -1396,7 +1397,7 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat __rcu_process_gp_end(rsp, rnp, rdp); /* Set state so that this CPU will detect the next quiescent state. */ - __note_new_gpnum(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); } /* -- cgit v1.2.3-70-g09d2 From efc151c33b971148894789dc7c5589dec46d4348 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Mar 2013 16:24:11 -0700 Subject: rcu: Convert rcutree_plugin.h printk calls This commit converts printk() calls to the corresponding pr_*() calls. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3db5a375d8d..207844ea022 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -53,38 +53,37 @@ static char __initdata nocb_buf[NR_CPUS * 5]; static void __init rcu_bootup_announce_oddness(void) { #ifdef CONFIG_RCU_TRACE - printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); + pr_info("\tRCU debugfs-based tracing is enabled.\n"); #endif #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) - printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", + pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", CONFIG_RCU_FANOUT); #endif #ifdef CONFIG_RCU_FANOUT_EXACT - printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); + pr_info("\tHierarchical RCU autobalancing is disabled.\n"); #endif #ifdef CONFIG_RCU_FAST_NO_HZ - printk(KERN_INFO - "\tRCU dyntick-idle grace-period acceleration is enabled.\n"); + pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); #endif #ifdef CONFIG_PROVE_RCU - printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); + pr_info("\tRCU lockdep checking is enabled.\n"); #endif #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE - printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); + pr_info("\tRCU torture testing starts during boot.\n"); #endif #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) - printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); + pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); #endif #if defined(CONFIG_RCU_CPU_STALL_INFO) - printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); + 
pr_info("\tAdditional per-CPU info printed with stalls.\n"); #endif #if NUM_RCU_LVL_4 != 0 - printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); + pr_info("\tFour-level hierarchy is enabled.\n"); #endif if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) - printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + pr_info("\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) - printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); + pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_NOCB_CPU #ifndef CONFIG_RCU_NOCB_CPU_NONE if (!have_rcu_nocb_mask) { @@ -123,7 +122,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); + pr_info("Preemptible hierarchical RCU implementation.\n"); rcu_bootup_announce_oddness(); } @@ -490,13 +489,13 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) static void rcu_print_task_stall_begin(struct rcu_node *rnp) { - printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", rnp->level, rnp->grplo, rnp->grphi); } static void rcu_print_task_stall_end(void) { - printk(KERN_CONT "\n"); + pr_cont("\n"); } #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ @@ -526,7 +525,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - printk(KERN_CONT " P%d", t->pid); + pr_cont(" P%d", t->pid); ndetected++; } rcu_print_task_stall_end(); @@ -942,7 +941,7 @@ static struct rcu_state *rcu_state = &rcu_sched_state; */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO "Hierarchical RCU implementation.\n"); + pr_info("Hierarchical RCU implementation.\n"); rcu_bootup_announce_oddness(); } @@ -1883,7 +1882,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) /* Initiate the stall-info list. */ static void print_cpu_stall_info_begin(void) { - printk(KERN_CONT "\n"); + pr_cont("\n"); } /* @@ -1914,7 +1913,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, @@ -1925,7 +1924,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) /* Terminate the stall-info list. */ static void print_cpu_stall_info_end(void) { - printk(KERN_ERR "\t"); + pr_err("\t"); } /* Zero ->ticks_this_gp for all flavors of RCU. */ @@ -1948,17 +1947,17 @@ static void increment_cpu_stall_ticks(void) static void print_cpu_stall_info_begin(void) { - printk(KERN_CONT " {"); + pr_cont(" {"); } static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) { - printk(KERN_CONT " %d", cpu); + pr_cont(" %d", cpu); } static void print_cpu_stall_info_end(void) { - printk(KERN_CONT "} "); + pr_cont("} "); } static void zero_cpu_stall_ticks(struct rcu_data *rdp) -- cgit v1.2.3-70-g09d2 From 470716fc043aba2fea832334e58d5cd5d82288a3 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 19 Mar 2013 11:32:11 -0700 Subject: rcu: Switch callers from rcu_process_gp_end() to note_gp_changes() Because note_gp_changes() now incorporates rcu_process_gp_end() function, this commit switches to the former and eliminates the latter. In addition, this commit changes external calls from __rcu_process_gp_end() to __note_gp_changes(). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 31 +++---------------------------- kernel/rcutree_plugin.h | 2 +- 2 files changed, 4 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7eb2bc95300..b04f134ab8b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1343,28 +1343,6 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } -/* - * Advance this CPU's callbacks, but only if the current grace period - * has ended. This may be called only from the CPU to whom the rdp - * belongs. - */ -static void -rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_node *rnp; - - local_irq_save(flags); - rnp = rdp->mynode; - if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ - local_irq_restore(flags); - return; - } - __rcu_process_gp_end(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - /* * Did someone else start a new RCU grace period start since we last * checked? Update local state appropriately if so. Must be called @@ -1393,9 +1371,6 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - /* Prior grace period ended, so advance callbacks for current CPU. */ - __rcu_process_gp_end(rsp, rnp, rdp); - /* Set state so that this CPU will detect the next quiescent state. */ __note_gp_changes(rsp, rnp, rdp); } @@ -1531,7 +1506,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) - __rcu_process_gp_end(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); @@ -2276,7 +2251,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); /* Handle the end of a grace period that some other CPU ended. */ - rcu_process_gp_end(rsp, rdp); + note_gp_changes(rsp, rdp); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); @@ -2362,7 +2337,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { /* Are we ignoring a completed grace period? */ - rcu_process_gp_end(rsp, rdp); + note_gp_changes(rsp, rdp); check_for_new_grace_period(rsp, rdp); /* Start a new grace period if one not already started. 
*/ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 207844ea022..f279148a016 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1628,7 +1628,7 @@ static bool rcu_try_advance_all_cbs(void) */ if (rdp->completed != rnp->completed && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) - rcu_process_gp_end(rsp, rdp); + note_gp_changes(rsp, rdp); if (cpu_has_callbacks_ready_to_invoke(rdp)) cbs_ready = true; -- cgit v1.2.3-70-g09d2 From ba9fbe955f026780e6b27c279dba7c86dfdcb7d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2013 11:53:31 -0700 Subject: rcu: Merge __rcu_process_gp_end() into __note_gp_changes() This commit eliminates some duplicated code by merging __rcu_process_gp_end() into __note_gp_changes(). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 48 ++++++------------------------------------------ 1 file changed, 6 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b04f134ab8b..ac8f03c4147 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1254,18 +1254,16 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * Advance this CPU's callbacks, but only if the current grace period - * has ended. This may be called only from the CPU to whom the rdp - * belongs. In addition, the corresponding leaf rcu_node structure's - * ->lock must be held by the caller, with irqs disabled. + * Update CPU-local rcu_data state to record the beginnings and ends of + * grace periods. The caller must hold the ->lock of the leaf rcu_node + * structure corresponding to the current CPU, and must have irqs disabled. */ -static void -__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - /* Did another grace period end? */ + /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed) { - /* No, so just accelerate recent callbacks. */ + /* No grace period end, so just accelerate recent callbacks. */ rcu_accelerate_cbs(rsp, rnp, rdp); } else { @@ -1276,41 +1274,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); - - /* - * If we were in an extended quiescent state, we may have - * missed some grace periods that others CPUs handled on - * our behalf. Catch up with this state to avoid noting - * spurious new grace periods. If another grace period - * has started, then rnp->gpnum will have advanced, so - * we will detect this later on. Of course, any quiescent - * states we found for the old GP are now invalid. - */ - if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { - rdp->gpnum = rdp->completed; - rdp->passed_quiesce = 0; - } - - /* - * If RCU does not need a quiescent state from this CPU, - * then make sure that this CPU doesn't go looking for one. - */ - if ((rnp->qsmask & rdp->grpmask) == 0) - rdp->qs_pending = 0; } -} - -/* - * Update CPU-local rcu_data state to record the newly noticed grace period. - * This is used both when we started the grace period and when we notice - * that someone else started the grace period. The caller must hold the - * ->lock of the leaf rcu_node structure corresponding to the current CPU, - * and must have irqs disabled. 
- */ -static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - /* Handle the ends of any preceding grace periods first. */ - __rcu_process_gp_end(rsp, rnp, rdp); if (rdp->gpnum != rnp->gpnum) { /* -- cgit v1.2.3-70-g09d2 From 63274cfb94aac109fc2490a70a96b26751608e57 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2013 12:21:29 -0700 Subject: rcu: Eliminate check_for_new_grace_period() wrapper function One of the calls to check_for_new_grace_period() is now redundant due to an immediately preceding call to note_gp_changes(). Eliminating this redundant call leaves a single caller, which is simpler if inlined. This commit therefore eliminates the redundant call and inlines the body of check_for_new_grace_period() into the single remaining call site. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ac8f03c4147..b73014998b4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1307,26 +1307,6 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } -/* - * Did someone else start a new RCU grace period start since we last - * checked? Update local state appropriately if so. Must be called - * on the CPU corresponding to rdp. - */ -static int -check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - int ret = 0; - - local_irq_save(flags); - if (rdp->gpnum != rsp->gpnum) { - note_gp_changes(rsp, rdp); - ret = 1; - } - local_irq_restore(flags); - return ret; -} - /* * Do per-CPU grace-period initialization for running CPU. The caller * must hold the lock of the leaf rcu_node structure corresponding to @@ -1749,8 +1729,10 @@ static void rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) { /* If there is now a new grace period, record and return. */ - if (check_for_new_grace_period(rsp, rdp)) + if (rdp->gpnum != rsp->gpnum) { + note_gp_changes(rsp, rdp); return; + } /* * Does this CPU still need to do its part for current grace period? @@ -2302,7 +2284,6 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Are we ignoring a completed grace period? */ note_gp_changes(rsp, rdp); - check_for_new_grace_period(rsp, rdp); /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { -- cgit v1.2.3-70-g09d2 From ce3d9c03d1fa079678cc8df1517011e215517cda Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2013 12:27:50 -0700 Subject: rcu: Inline trivial wrapper function rcu_start_gp_per_cpu() Given the changes that introduce note_gp_change(), rcu_start_gp_per_cpu() is now a trivial wrapper function with only one caller. This commit therefore inlines it into its sole call site. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b73014998b4..391bd724cd7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1307,18 +1307,6 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } -/* - * Do per-CPU grace-period initialization for running CPU. 
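After the merge above, a single helper reconciles this CPU's view of grace periods with the leaf rcu_node's view, handling ends before beginnings. A rough stand-alone model of that combined logic (hypothetical types and names, with the callback lists, locking, and tracing omitted) might read:

	struct leaf_view {			/* what the leaf rcu_node reports */
		unsigned long gpnum;		/* most recent GP known to have started */
		unsigned long completed;	/* most recent GP known to have ended */
	};

	struct cpu_view {			/* what this CPU has noticed so far */
		unsigned long gpnum;
		unsigned long completed;
		int qs_pending;			/* current GP still needs a QS from us? */
	};

	/* Catch up on any grace periods that ended, then on any that began. */
	static void note_gp_changes_model(const struct leaf_view *l, struct cpu_view *c)
	{
		if (c->completed != l->completed)
			c->completed = l->completed;	/* a GP ended: callbacks may be advanced */
		if (c->gpnum != l->gpnum) {
			c->gpnum = l->gpnum;		/* a GP began: watch for a quiescent state */
			c->qs_pending = 1;
		}
	}
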
The caller - * must hold the lock of the leaf rcu_node structure corresponding to - * this CPU. - */ -static void -rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - /* Set state so that this CPU will detect the next quiescent state. */ - __note_gp_changes(rsp, rnp, rdp); -} - /* * Initialize a new grace period. */ @@ -1367,7 +1355,7 @@ static int rcu_gp_init(struct rcu_state *rsp) WARN_ON_ONCE(rnp->completed != rsp->completed); ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) - rcu_start_gp_per_cpu(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, -- cgit v1.2.3-70-g09d2 From 05eb552bf5ed9e7277bdc9c273ed2f4e9b7dc3e5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2013 12:38:24 -0700 Subject: rcu: Move redundant call to note_gp_changes() into called function The __rcu_process_callbacks() function invokes note_gp_changes() immediately before invoking rcu_check_quiescent_state(), which conditionally invokes that same function. This commit therefore eliminates the call to note_gp_changes() in __rcu_process_callbacks() in favor of making the call from rcu_check_quiescent_state() to note_gp_changes() unconditional. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 391bd724cd7..7a5194ef90d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1716,11 +1716,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) { - /* If there is now a new grace period, record and return. */ - if (rdp->gpnum != rsp->gpnum) { - note_gp_changes(rsp, rdp); - return; - } + /* Check for grace-period ends and beginnings. */ + note_gp_changes(rsp, rdp); /* * Does this CPU still need to do its part for current grace period? @@ -2184,9 +2181,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); - /* Handle the end of a grace period that some other CPU ended. */ - note_gp_changes(rsp, rdp); - /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); -- cgit v1.2.3-70-g09d2 From 9a5739d73f9369ba1cdba3889ee4e2f87be25a46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Mar 2013 20:48:36 -0700 Subject: rcu: Remove "Experimental" flags After a release or two, features are no longer experimental. Therefore, this commit removes the "Experimental" tag from them. Reported-by: Paul Gortmaker Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- init/Kconfig | 2 +- kernel/rcutree_plugin.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 2d9b83104dc..8df5116621e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -656,7 +656,7 @@ config RCU_BOOST_DELAY Accept the default if unsure.
config RCU_NOCB_CPU - bool "Offload RCU callback processing from boot-selected CPUs (EXPERIMENTAL" + bool "Offload RCU callback processing from boot-selected CPUs" depends on TREE_RCU || TREE_PREEMPT_RCU default n help diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 207844ea022..6b3ccaae93a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -81,7 +81,7 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tFour-level hierarchy is enabled.\n"); #endif if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) - pr_info("\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_NOCB_CPU @@ -91,19 +91,19 @@ static void __init rcu_bootup_announce_oddness(void) have_rcu_nocb_mask = true; } #ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tExperimental no-CBs CPU 0\n"); + pr_info("\tOffload RCU callbacks from CPU 0\n"); cpumask_set_cpu(0, rcu_nocb_mask); #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ #ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tExperimental no-CBs for all CPUs\n"); + pr_info("\tOffload RCU callbacks from all CPUs\n"); cpumask_setall(rcu_nocb_mask); #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ if (have_rcu_nocb_mask) { cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); - pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); + pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); if (rcu_nocb_poll) - pr_info("\tExperimental polled no-CBs CPUs.\n"); + pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); } #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ } -- cgit v1.2.3-70-g09d2 From 026ad2835ce6202069e7aa0b11f5f1be4de34550 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Apr 2013 22:14:11 -0700 Subject: rcu: Drive quiescent-state-forcing delay from HZ Systems with HZ=100 can have slow bootup times due to the default three-jiffy delays between quiescent-state forcing attempts. This commit therefore auto-tunes the RCU_JIFFIES_TILL_FORCE_QS value based on the value of HZ. However, this would break very large systems that require more time between quiescent-state forcing attempts. This commit therefore also ups the default delay by one jiffy for each 256 CPUs that might be on the system (based off of nr_cpu_ids at runtime, -not- NR_CPUS at build time). Updated to collapse #ifdefs for RCU_JIFFIES_TILL_FORCE_QS into a step-function definition as suggested by Josh Triplett. Reported-by: Paul Mackerras Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 18 ++++++++++++++++-- kernel/rcutree.h | 15 ++++++++++----- 2 files changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1009c0ccd4b..f344d3c824a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -218,8 +218,8 @@ module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); -static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; -static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; +static ulong jiffies_till_first_fqs = ULONG_MAX; +static ulong jiffies_till_next_fqs = ULONG_MAX; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); @@ -3265,11 +3265,25 @@ static void __init rcu_init_one(struct rcu_state *rsp, */ static void __init rcu_init_geometry(void) { + ulong d; int i; int j; int n = nr_cpu_ids; int rcu_capacity[MAX_RCU_LVLS + 1]; + /* + * Initialize any unspecified boot parameters. + * The default values of jiffies_till_first_fqs and + * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS + * value, which is a function of HZ, then adding one for each + * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. + */ + d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; + if (jiffies_till_first_fqs == ULONG_MAX) + jiffies_till_first_fqs = d; + if (jiffies_till_next_fqs == ULONG_MAX) + jiffies_till_next_fqs = d; + /* If the compile-time values are accurate, just leave. */ if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4df503470e4..4a39d364493 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -343,12 +343,17 @@ struct rcu_data { #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK -#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ +#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) + /* For jiffies_till_first_fqs and */ + /* and jiffies_till_next_fqs. */ -#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ - /* to take at least one */ - /* scheduling clock irq */ - /* before ratting on them. */ +#define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */ + /* delay between bouts of */ + /* quiescent-state forcing. */ + +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */ + /* at least one scheduling clock */ + /* irq before ratting on them. */ #define rcu_wait(cond) \ do { \ -- cgit v1.2.3-70-g09d2 From 4982969d965ec87b1887c86d2e0b3d81065e1d38 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Mar 2013 16:56:53 -0700 Subject: rcu: Merge adjacent identical ifdefs Two ifdefs in kernel/rcupdate.c now have identical conditions with nothing between them, so the commit merges them into a single ifdef. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcupdate.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 48ab70384a4..faeea984dba 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -145,9 +145,6 @@ static struct lock_class_key rcu_sched_lock_key; struct lockdep_map rcu_sched_lock_map = STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); EXPORT_SYMBOL_GPL(rcu_sched_lock_map); -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC int debug_lockdep_rcu_enabled(void) { -- cgit v1.2.3-70-g09d2 From 99f88919f8fa8a8b01b5306c59c9977b94604df8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Mar 2013 16:54:14 -0700 Subject: rcu: Remove srcu_read_lock_raw() and srcu_read_unlock_raw(). These interfaces never did get used, so this commit removes them, their rcutorture tests, and documentation referencing them. Signed-off-by: Paul E. McKenney Reviewed-by: Lai Jiangshan Reviewed-by: Josh Triplett --- Documentation/RCU/checklist.txt | 6 ------ Documentation/RCU/torture.txt | 6 ------ Documentation/RCU/whatisRCU.txt | 22 +++++++-------------- include/linux/srcu.h | 43 ----------------------------------------- kernel/rcutorture.c | 39 ------------------------------------- 5 files changed, 7 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 79e789b8b8e..7703ec73a9b 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -354,12 +354,6 @@ over a rather long period of time, but improvements are always welcome! using RCU rather than SRCU, because RCU is almost always faster and easier to use than is SRCU. - If you need to enter your read-side critical section in a - hardirq or exception handler, and then exit that same read-side - critical section in the task that was interrupted, then you need - to srcu_read_lock_raw() and srcu_read_unlock_raw(), which avoid - the lockdep checking that would otherwise this practice illegal. - Also unlike other forms of RCU, explicit initialization and cleanup is required via init_srcu_struct() and cleanup_srcu_struct(). These are passed a "struct srcu_struct" diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index 7dce8a17eac..d8a50238739 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -182,12 +182,6 @@ torture_type The type of RCU to test, with string values as follows: "srcu_expedited": srcu_read_lock(), srcu_read_unlock() and synchronize_srcu_expedited(). - "srcu_raw": srcu_read_lock_raw(), srcu_read_unlock_raw(), - and call_srcu(). - - "srcu_raw_sync": srcu_read_lock_raw(), srcu_read_unlock_raw(), - and synchronize_srcu(). - "sched": preempt_disable(), preempt_enable(), and call_rcu_sched(). diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 10df0b82f45..0f0fb7c432c 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -842,9 +842,7 @@ SRCU: Critical sections Grace period Barrier srcu_read_lock synchronize_srcu srcu_barrier srcu_read_unlock call_srcu - srcu_read_lock_raw synchronize_srcu_expedited - srcu_read_unlock_raw - srcu_dereference + srcu_dereference synchronize_srcu_expedited SRCU: Initialization/cleanup init_srcu_struct @@ -865,38 +863,32 @@ list can be helpful: a. Will readers need to block? If so, you need SRCU. -b. 
Is it necessary to start a read-side critical section in a - hardirq handler or exception handler, and then to complete - this read-side critical section in the task that was - interrupted? If so, you need SRCU's srcu_read_lock_raw() and - srcu_read_unlock_raw() primitives. - -c. What about the -rt patchset? If readers would need to block +b. What about the -rt patchset? If readers would need to block in an non-rt kernel, you need SRCU. If readers would block in a -rt kernel, but not in a non-rt kernel, SRCU is not necessary. -d. Do you need to treat NMI handlers, hardirq handlers, +c. Do you need to treat NMI handlers, hardirq handlers, and code segments with preemption disabled (whether via preempt_disable(), local_irq_save(), local_bh_disable(), or some other mechanism) as if they were explicit RCU readers? If so, RCU-sched is the only choice that will work for you. -e. Do you need RCU grace periods to complete even in the face +d. Do you need RCU grace periods to complete even in the face of softirq monopolization of one or more of the CPUs? For example, is your code subject to network-based denial-of-service attacks? If so, you need RCU-bh. -f. Is your workload too update-intensive for normal use of +e. Is your workload too update-intensive for normal use of RCU, but inappropriate for other synchronization mechanisms? If so, consider SLAB_DESTROY_BY_RCU. But please be careful! -g. Do you need read-side critical sections that are respected +f. Do you need read-side critical sections that are respected even though they are in the middle of the idle loop, during user-mode execution, or on an offlined CPU? If so, SRCU is the only choice that will work for you. -h. Otherwise, use RCU. +g. Otherwise, use RCU. Of course, this all assumes that you have determined that RCU is in fact the right tool for your job. diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 04f4121a23a..c114614ed17 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -237,47 +237,4 @@ static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) __srcu_read_unlock(sp, idx); } -/** - * srcu_read_lock_raw - register a new reader for an SRCU-protected structure. - * @sp: srcu_struct in which to register the new reader. - * - * Enter an SRCU read-side critical section. Similar to srcu_read_lock(), - * but avoids the RCU-lockdep checking. This means that it is legal to - * use srcu_read_lock_raw() in one context, for example, in an exception - * handler, and then have the matching srcu_read_unlock_raw() in another - * context, for example in the task that took the exception. - * - * However, the entire SRCU read-side critical section must reside within a - * single task. For example, beware of using srcu_read_lock_raw() in - * a device interrupt handler and srcu_read_unlock() in the interrupted - * task: This will not work if interrupts are threaded. - */ -static inline int srcu_read_lock_raw(struct srcu_struct *sp) -{ - unsigned long flags; - int ret; - - local_irq_save(flags); - ret = __srcu_read_lock(sp); - local_irq_restore(flags); - return ret; -} - -/** - * srcu_read_unlock_raw - unregister reader from an SRCU-protected structure. - * @sp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock_raw(). - * - * Exit an SRCU read-side critical section without lockdep-RCU checking. - * See srcu_read_lock_raw() for more details. 
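With the _raw variants removed here, the remaining SRCU read-side API is the lockdep-checked pair. A typical reader, assuming a hypothetical srcu_struct named my_srcu that is initialized elsewhere with init_srcu_struct(), looks roughly like this:

	#include <linux/srcu.h>

	static struct srcu_struct my_srcu;	/* hypothetical; init_srcu_struct(&my_srcu) elsewhere */

	static void my_reader(void)		/* hypothetical reader */
	{
		int idx;

		idx = srcu_read_lock(&my_srcu);
		/*
		 * SRCU read-side critical section: sleeping is permitted here,
		 * but lock and unlock must happen in the same task.
		 */
		srcu_read_unlock(&my_srcu, idx);
	}
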
- */ -static inline void srcu_read_unlock_raw(struct srcu_struct *sp, int idx) -{ - unsigned long flags; - - local_irq_save(flags); - __srcu_read_unlock(sp, idx); - local_irq_restore(flags); -} - #endif diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e1f3a8c9672..b1fa5510388 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -695,44 +695,6 @@ static struct rcu_torture_ops srcu_sync_ops = { .name = "srcu_sync" }; -static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) -{ - return srcu_read_lock_raw(&srcu_ctl); -} - -static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) -{ - srcu_read_unlock_raw(&srcu_ctl, idx); -} - -static struct rcu_torture_ops srcu_raw_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock_raw, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock_raw, - .completed = srcu_torture_completed, - .deferred_free = srcu_torture_deferred_free, - .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_raw" -}; - -static struct rcu_torture_ops srcu_raw_sync_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock_raw, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock_raw, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_raw_sync" -}; - static void srcu_torture_synchronize_expedited(void) { synchronize_srcu_expedited(&srcu_ctl); @@ -1983,7 +1945,6 @@ rcu_torture_init(void) { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, - &srcu_raw_ops, &srcu_raw_sync_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); -- cgit v1.2.3-70-g09d2 From 127781d1ba1ee5bbe1780afa35dd0e71583b143d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 08:44:00 -0700 Subject: rcu: Remove TINY_PREEMPT_RCU TINY_PREEMPT_RCU adds significant code and complexity, but does not offer commensurate benefits. People currently using TINY_PREEMPT_RCU can get much better memory footprint with TINY_RCU, or, if they really need preemptible RCU, they can use TREE_PREEMPT_RCU with a relatively minor degradation in memory footprint. Please note that this move has been widely publicized on LKML (https://lkml.org/lkml/2012/11/12/545) and on LWN (http://lwn.net/Articles/541037/). This commit therefore removes TINY_PREEMPT_RCU. Signed-off-by: Paul E. 
McKenney [ paulmck: Updated to eliminate #else in rcutiny.h as suggested by Josh ] Reviewed-by: Josh Triplett --- include/linux/hardirq.h | 2 +- include/linux/rcupdate.h | 2 +- include/linux/rcutiny.h | 24 +- init/Kconfig | 10 +- kernel/rcutiny_plugin.h | 854 ----------------------------------------------- 5 files changed, 5 insertions(+), 887 deletions(-) (limited to 'kernel') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index c1d6555d256..05bcc090376 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -128,7 +128,7 @@ extern void synchronize_irq(unsigned int irq); # define synchronize_irq(irq) barrier() #endif -#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) +#if defined(CONFIG_TINY_RCU) static inline void rcu_nmi_enter(void) { diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ddcc7826d90..70b1522badd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -277,7 +277,7 @@ void wait_rcu_gp(call_rcu_func_t crf); #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) #include -#elif defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) +#elif defined(CONFIG_TINY_RCU) #include #else #error "Unknown RCU implementation specified to kernel configuration" diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 4e56a9c69a3..d3c094ffa96 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -53,16 +53,7 @@ static inline void rcu_barrier(void) rcu_barrier_sched(); /* Only one CPU, so only one list of callbacks! */ } -#else /* #ifdef CONFIG_TINY_RCU */ - -void synchronize_rcu_expedited(void); - -static inline void rcu_barrier(void) -{ - wait_rcu_gp(call_rcu); -} - -#endif /* #else #ifdef CONFIG_TINY_RCU */ +#endif /* #ifdef CONFIG_TINY_RCU */ static inline void synchronize_rcu_bh(void) { @@ -97,18 +88,7 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) return 0; } -#else /* #ifdef CONFIG_TINY_RCU */ - -void rcu_preempt_note_context_switch(void); -int rcu_preempt_needs_cpu(void); - -static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) -{ - *delta_jiffies = ULONG_MAX; - return rcu_preempt_needs_cpu(); -} - -#endif /* #else #ifdef CONFIG_TINY_RCU */ +#endif /* #ifdef CONFIG_TINY_RCU */ static inline void rcu_note_context_switch(int cpu) { diff --git a/init/Kconfig b/init/Kconfig index 2d9b83104dc..e7fb255413d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -459,18 +459,10 @@ config TINY_RCU is not required. This option greatly reduces the memory footprint of RCU. -config TINY_PREEMPT_RCU - bool "Preemptible UP-only small-memory-footprint RCU" - depends on PREEMPT && !SMP - help - This option selects the RCU implementation that is designed - for real-time UP systems. This option greatly reduces the - memory footprint of RCU. - endchoice config PREEMPT_RCU - def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU ) + def_bool TREE_PREEMPT_RCU help This option enables preemptible-RCU code that is common between the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 8a233002fae..29a4dd78c8b 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -102,763 +102,6 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall_preempt()); } -#ifdef CONFIG_TINY_PREEMPT_RCU - -#include - -/* Global control variables for preemptible RCU. */ -struct rcu_preempt_ctrlblk { - struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. 
*/ - struct rcu_head **nexttail; - /* Tasks blocked in a preemptible RCU */ - /* read-side critical section while an */ - /* preemptible-RCU grace period is in */ - /* progress must wait for a later grace */ - /* period. This pointer points to the */ - /* ->next pointer of the last task that */ - /* must wait for a later grace period, or */ - /* to &->rcb.rcucblist if there is no */ - /* such task. */ - struct list_head blkd_tasks; - /* Tasks blocked in RCU read-side critical */ - /* section. Tasks are placed at the head */ - /* of this list and age towards the tail. */ - struct list_head *gp_tasks; - /* Pointer to the first task blocking the */ - /* current grace period, or NULL if there */ - /* is no such task. */ - struct list_head *exp_tasks; - /* Pointer to first task blocking the */ - /* current expedited grace period, or NULL */ - /* if there is no such task. If there */ - /* is no current expedited grace period, */ - /* then there cannot be any such task. */ -#ifdef CONFIG_RCU_BOOST - struct list_head *boost_tasks; - /* Pointer to first task that needs to be */ - /* priority-boosted, or NULL if no priority */ - /* boosting is needed. If there is no */ - /* current or expedited grace period, there */ - /* can be no such task. */ -#endif /* #ifdef CONFIG_RCU_BOOST */ - u8 gpnum; /* Current grace period. */ - u8 gpcpu; /* Last grace period blocked by the CPU. */ - u8 completed; /* Last grace period completed. */ - /* If all three are equal, RCU is idle. */ -#ifdef CONFIG_RCU_BOOST - unsigned long boost_time; /* When to start boosting (jiffies) */ -#endif /* #ifdef CONFIG_RCU_BOOST */ -#ifdef CONFIG_RCU_TRACE - unsigned long n_grace_periods; -#ifdef CONFIG_RCU_BOOST - unsigned long n_tasks_boosted; - /* Total number of tasks boosted. */ - unsigned long n_exp_boosts; - /* Number of tasks boosted for expedited GP. */ - unsigned long n_normal_boosts; - /* Number of tasks boosted for normal GP. */ - unsigned long n_balk_blkd_tasks; - /* Refused to boost: no blocked tasks. */ - unsigned long n_balk_exp_gp_tasks; - /* Refused to boost: nothing blocking GP. */ - unsigned long n_balk_boost_tasks; - /* Refused to boost: already boosting. */ - unsigned long n_balk_notyet; - /* Refused to boost: not yet time. */ - unsigned long n_balk_nos; - /* Refused to boost: not sure why, though. */ - /* This can happen due to race conditions. */ -#endif /* #ifdef CONFIG_RCU_BOOST */ -#endif /* #ifdef CONFIG_RCU_TRACE */ -}; - -static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { - .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, - .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, - .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, - .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), - RCU_TRACE(.rcb.name = "rcu_preempt") -}; - -static int rcu_preempted_readers_exp(void); -static void rcu_report_exp_done(void); - -/* - * Return true if the CPU has not yet responded to the current grace period. - */ -static int rcu_cpu_blocking_cur_gp(void) -{ - return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; -} - -/* - * Check for a running RCU reader. Because there is only one CPU, - * there can be but one running RCU reader at a time. ;-) - * - * Returns zero if there are no running readers. Returns a positive - * number if there is at least one reader within its RCU read-side - * critical section. Returns a negative number if an outermost reader - * is in the midst of exiting from its RCU read-side critical section - * - * Returns zero if there are no running readers. 
Returns a positive - * number if there is at least one reader within its RCU read-side - * critical section. Returns a negative number if an outermost reader - * is in the midst of exiting from its RCU read-side critical section. - */ -static int rcu_preempt_running_reader(void) -{ - return current->rcu_read_lock_nesting; -} - -/* - * Check for preempted RCU readers blocking any grace period. - * If the caller needs a reliable answer, it must disable hard irqs. - */ -static int rcu_preempt_blocked_readers_any(void) -{ - return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); -} - -/* - * Check for preempted RCU readers blocking the current grace period. - * If the caller needs a reliable answer, it must disable hard irqs. - */ -static int rcu_preempt_blocked_readers_cgp(void) -{ - return rcu_preempt_ctrlblk.gp_tasks != NULL; -} - -/* - * Return true if another preemptible-RCU grace period is needed. - */ -static int rcu_preempt_needs_another_gp(void) -{ - return *rcu_preempt_ctrlblk.rcb.curtail != NULL; -} - -/* - * Return true if a preemptible-RCU grace period is in progress. - * The caller must disable hardirqs. - */ -static int rcu_preempt_gp_in_progress(void) -{ - return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; -} - -/* - * Advance a ->blkd_tasks-list pointer to the next entry, instead - * returning NULL if at the end of the list. - */ -static struct list_head *rcu_next_node_entry(struct task_struct *t) -{ - struct list_head *np; - - np = t->rcu_node_entry.next; - if (np == &rcu_preempt_ctrlblk.blkd_tasks) - np = NULL; - return np; -} - -#ifdef CONFIG_RCU_TRACE - -#ifdef CONFIG_RCU_BOOST -static void rcu_initiate_boost_trace(void); -#endif /* #ifdef CONFIG_RCU_BOOST */ - -/* - * Dump additional statistice for TINY_PREEMPT_RCU. - */ -static void show_tiny_preempt_stats(struct seq_file *m) -{ - seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", - rcu_preempt_ctrlblk.rcb.qlen, - rcu_preempt_ctrlblk.n_grace_periods, - rcu_preempt_ctrlblk.gpnum, - rcu_preempt_ctrlblk.gpcpu, - rcu_preempt_ctrlblk.completed, - "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], - "N."[!rcu_preempt_ctrlblk.gp_tasks], - "E."[!rcu_preempt_ctrlblk.exp_tasks]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", - " ", - "B."[!rcu_preempt_ctrlblk.boost_tasks], - rcu_preempt_ctrlblk.n_tasks_boosted, - rcu_preempt_ctrlblk.n_exp_boosts, - rcu_preempt_ctrlblk.n_normal_boosts, - (int)(jiffies & 0xffff), - (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); - seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", - " balk", - rcu_preempt_ctrlblk.n_balk_blkd_tasks, - rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, - rcu_preempt_ctrlblk.n_balk_boost_tasks, - rcu_preempt_ctrlblk.n_balk_notyet, - rcu_preempt_ctrlblk.n_balk_nos); -#endif /* #ifdef CONFIG_RCU_BOOST */ -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -#ifdef CONFIG_RCU_BOOST - -#include "rtmutex_common.h" - -#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO - -/* Controls for rcu_kthread() kthread. */ -static struct task_struct *rcu_kthread_task; -static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); -static unsigned long have_rcu_kthread_work; - -/* - * Carry out RCU priority boosting on the task indicated by ->boost_tasks, - * and advance ->boost_tasks to the next task in the ->blkd_tasks list. 
- */ -static int rcu_boost(void) -{ - unsigned long flags; - struct rt_mutex mtx; - struct task_struct *t; - struct list_head *tb; - - if (rcu_preempt_ctrlblk.boost_tasks == NULL && - rcu_preempt_ctrlblk.exp_tasks == NULL) - return 0; /* Nothing to boost. */ - - local_irq_save(flags); - - /* - * Recheck with irqs disabled: all tasks in need of boosting - * might exit their RCU read-side critical sections on their own - * if we are preempted just before disabling irqs. - */ - if (rcu_preempt_ctrlblk.boost_tasks == NULL && - rcu_preempt_ctrlblk.exp_tasks == NULL) { - local_irq_restore(flags); - return 0; - } - - /* - * Preferentially boost tasks blocking expedited grace periods. - * This cannot starve the normal grace periods because a second - * expedited grace period must boost all blocked tasks, including - * those blocking the pre-existing normal grace period. - */ - if (rcu_preempt_ctrlblk.exp_tasks != NULL) { - tb = rcu_preempt_ctrlblk.exp_tasks; - RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); - } else { - tb = rcu_preempt_ctrlblk.boost_tasks; - RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); - } - RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); - - /* - * We boost task t by manufacturing an rt_mutex that appears to - * be held by task t. We leave a pointer to that rt_mutex where - * task t can find it, and task t will release the mutex when it - * exits its outermost RCU read-side critical section. Then - * simply acquiring this artificial rt_mutex will boost task - * t's priority. (Thanks to tglx for suggesting this approach!) - */ - t = container_of(tb, struct task_struct, rcu_node_entry); - rt_mutex_init_proxy_locked(&mtx, t); - t->rcu_boost_mutex = &mtx; - local_irq_restore(flags); - rt_mutex_lock(&mtx); - rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ - - return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || - ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; -} - -/* - * Check to see if it is now time to start boosting RCU readers blocking - * the current grace period, and, if so, tell the rcu_kthread_task to - * start boosting them. If there is an expedited boost in progress, - * we wait for it to complete. - * - * If there are no blocked readers blocking the current grace period, - * return 0 to let the caller know, otherwise return 1. Note that this - * return value is independent of whether or not boosting was done. - */ -static int rcu_initiate_boost(void) -{ - if (!rcu_preempt_blocked_readers_cgp() && - rcu_preempt_ctrlblk.exp_tasks == NULL) { - RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); - return 0; - } - if (rcu_preempt_ctrlblk.exp_tasks != NULL || - (rcu_preempt_ctrlblk.gp_tasks != NULL && - rcu_preempt_ctrlblk.boost_tasks == NULL && - ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { - if (rcu_preempt_ctrlblk.exp_tasks == NULL) - rcu_preempt_ctrlblk.boost_tasks = - rcu_preempt_ctrlblk.gp_tasks; - invoke_rcu_callbacks(); - } else { - RCU_TRACE(rcu_initiate_boost_trace()); - } - return 1; -} - -#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) - -/* - * Do priority-boost accounting for the start of a new grace period. - */ -static void rcu_preempt_boost_start_gp(void) -{ - rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; -} - -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* - * If there is no RCU priority boosting, we don't initiate boosting, - * but we do indicate whether there are blocked readers blocking the - * current grace period. 
- */ -static int rcu_initiate_boost(void) -{ - return rcu_preempt_blocked_readers_cgp(); -} - -/* - * If there is no RCU priority boosting, nothing to do at grace-period start. - */ -static void rcu_preempt_boost_start_gp(void) -{ -} - -#endif /* else #ifdef CONFIG_RCU_BOOST */ - -/* - * Record a preemptible-RCU quiescent state for the specified CPU. Note - * that this just means that the task currently running on the CPU is - * in a quiescent state. There might be any number of tasks blocked - * while in an RCU read-side critical section. - * - * Unlike the other rcu_*_qs() functions, callers to this function - * must disable irqs in order to protect the assignment to - * ->rcu_read_unlock_special. - * - * Because this is a single-CPU implementation, the only way a grace - * period can end is if the CPU is in a quiescent state. The reason is - * that a blocked preemptible-RCU reader can exit its critical section - * only if the CPU is running it at the time. Therefore, when the - * last task blocking the current grace period exits its RCU read-side - * critical section, neither the CPU nor blocked tasks will be stopping - * the current grace period. (In contrast, SMP implementations - * might have CPUs running in RCU read-side critical sections that - * block later grace periods -- but this is not possible given only - * one CPU.) - */ -static void rcu_preempt_cpu_qs(void) -{ - /* Record both CPU and task as having responded to current GP. */ - rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; - current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; - - /* If there is no GP then there is nothing more to do. */ - if (!rcu_preempt_gp_in_progress()) - return; - /* - * Check up on boosting. If there are readers blocking the - * current grace period, leave. - */ - if (rcu_initiate_boost()) - return; - - /* Advance callbacks. */ - rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; - rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; - rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; - - /* If there are no blocked readers, next GP is done instantly. */ - if (!rcu_preempt_blocked_readers_any()) - rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; - - /* If there are done callbacks, cause them to be invoked. */ - if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) - invoke_rcu_callbacks(); -} - -/* - * Start a new RCU grace period if warranted. Hard irqs must be disabled. - */ -static void rcu_preempt_start_gp(void) -{ - if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { - - /* Official start of GP. */ - rcu_preempt_ctrlblk.gpnum++; - RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); - reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb); - - /* Any blocked RCU readers block new GP. */ - if (rcu_preempt_blocked_readers_any()) - rcu_preempt_ctrlblk.gp_tasks = - rcu_preempt_ctrlblk.blkd_tasks.next; - - /* Set up for RCU priority boosting. */ - rcu_preempt_boost_start_gp(); - - /* If there is no running reader, CPU is done with GP. */ - if (!rcu_preempt_running_reader()) - rcu_preempt_cpu_qs(); - } -} - -/* - * We have entered the scheduler, and the current task might soon be - * context-switched away from. If this task is in an RCU read-side - * critical section, we will no longer be able to rely on the CPU to - * record that fact, so we enqueue the task on the blkd_tasks list. - * If the task started after the current grace period began, as recorded - * by ->gpcpu, we enqueue at the beginning of the list. 
Otherwise - * before the element referenced by ->gp_tasks (or at the tail if - * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. - * The task will dequeue itself when it exits the outermost enclosing - * RCU read-side critical section. Therefore, the current grace period - * cannot be permitted to complete until the ->gp_tasks pointer becomes - * NULL. - * - * Caller must disable preemption. - */ -void rcu_preempt_note_context_switch(void) -{ - struct task_struct *t = current; - unsigned long flags; - - local_irq_save(flags); /* must exclude scheduler_tick(). */ - if (rcu_preempt_running_reader() > 0 && - (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { - - /* Possibly blocking in an RCU read-side critical section. */ - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; - - /* - * If this CPU has already checked in, then this task - * will hold up the next grace period rather than the - * current grace period. Queue the task accordingly. - * If the task is queued for the current grace period - * (i.e., this CPU has not yet passed through a quiescent - * state for the current grace period), then as long - * as that task remains queued, the current grace period - * cannot end. - */ - list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); - if (rcu_cpu_blocking_cur_gp()) - rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; - } else if (rcu_preempt_running_reader() < 0 && - t->rcu_read_unlock_special) { - /* - * Complete exit from RCU read-side critical section on - * behalf of preempted instance of __rcu_read_unlock(). - */ - rcu_read_unlock_special(t); - } - - /* - * Either we were not in an RCU read-side critical section to - * begin with, or we have now recorded that critical section - * globally. Either way, we can now note a quiescent state - * for this CPU. Again, if we were in an RCU read-side critical - * section, and if that critical section was blocking the current - * grace period, then the fact that the task has been enqueued - * means that current grace period continues to be blocked. - */ - rcu_preempt_cpu_qs(); - local_irq_restore(flags); -} - -/* - * Handle special cases during rcu_read_unlock(), such as needing to - * notify RCU core processing or task having blocked during the RCU - * read-side critical section. - */ -void rcu_read_unlock_special(struct task_struct *t) -{ - int empty; - int empty_exp; - unsigned long flags; - struct list_head *np; -#ifdef CONFIG_RCU_BOOST - struct rt_mutex *rbmp = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ - int special; - - /* - * NMI handlers cannot block and cannot safely manipulate state. - * They therefore cannot possibly be special, so just leave. - */ - if (in_nmi()) - return; - - local_irq_save(flags); - - /* - * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. - */ - special = t->rcu_read_unlock_special; - if (special & RCU_READ_UNLOCK_NEED_QS) - rcu_preempt_cpu_qs(); - - /* Hardware IRQ handlers cannot block. */ - if (in_irq() || in_serving_softirq()) { - local_irq_restore(flags); - return; - } - - /* Clean up if blocked during RCU read-side critical section. */ - if (special & RCU_READ_UNLOCK_BLOCKED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; - - /* - * Remove this task from the ->blkd_tasks list and adjust - * any pointers that might have been referencing it. 
- */ - empty = !rcu_preempt_blocked_readers_cgp(); - empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; - np = rcu_next_node_entry(t); - list_del_init(&t->rcu_node_entry); - if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) - rcu_preempt_ctrlblk.gp_tasks = np; - if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) - rcu_preempt_ctrlblk.exp_tasks = np; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) - rcu_preempt_ctrlblk.boost_tasks = np; -#endif /* #ifdef CONFIG_RCU_BOOST */ - - /* - * If this was the last task on the current list, and if - * we aren't waiting on the CPU, report the quiescent state - * and start a new grace period if needed. - */ - if (!empty && !rcu_preempt_blocked_readers_cgp()) { - rcu_preempt_cpu_qs(); - rcu_preempt_start_gp(); - } - - /* - * If this was the last task on the expedited lists, - * then we need wake up the waiting task. - */ - if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) - rcu_report_exp_done(); - } -#ifdef CONFIG_RCU_BOOST - /* Unboost self if was boosted. */ - if (t->rcu_boost_mutex != NULL) { - rbmp = t->rcu_boost_mutex; - t->rcu_boost_mutex = NULL; - rt_mutex_unlock(rbmp); - } -#endif /* #ifdef CONFIG_RCU_BOOST */ - local_irq_restore(flags); -} - -/* - * Check for a quiescent state from the current CPU. When a task blocks, - * the task is recorded in the rcu_preempt_ctrlblk structure, which is - * checked elsewhere. This is called from the scheduling-clock interrupt. - * - * Caller must disable hard irqs. - */ -static void rcu_preempt_check_callbacks(void) -{ - struct task_struct *t = current; - - if (rcu_preempt_gp_in_progress() && - (!rcu_preempt_running_reader() || - !rcu_cpu_blocking_cur_gp())) - rcu_preempt_cpu_qs(); - if (&rcu_preempt_ctrlblk.rcb.rcucblist != - rcu_preempt_ctrlblk.rcb.donetail) - invoke_rcu_callbacks(); - if (rcu_preempt_gp_in_progress() && - rcu_cpu_blocking_cur_gp() && - rcu_preempt_running_reader() > 0) - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; -} - -/* - * TINY_PREEMPT_RCU has an extra callback-list tail pointer to - * update, so this is invoked from rcu_process_callbacks() to - * handle that case. Of course, it is invoked for all flavors of - * RCU, but RCU callbacks can appear only on one of the lists, and - * neither ->nexttail nor ->donetail can possibly be NULL, so there - * is no need for an explicit check. - */ -static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) -{ - if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) - rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; -} - -/* - * Process callbacks for preemptible RCU. - */ -static void rcu_preempt_process_callbacks(void) -{ - __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); -} - -/* - * Queue a preemptible -RCU callback for invocation after a grace period. - */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - - debug_rcu_head_queue(head); - head->func = func; - head->next = NULL; - - local_irq_save(flags); - *rcu_preempt_ctrlblk.nexttail = head; - rcu_preempt_ctrlblk.nexttail = &head->next; - RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); - rcu_preempt_start_gp(); /* checks to see if GP needed. */ - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/* - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. 
RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void synchronize_rcu(void) -{ - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - if (!rcu_scheduler_active) - return; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - - WARN_ON_ONCE(rcu_preempt_running_reader()); - if (!rcu_preempt_blocked_readers_any()) - return; - - /* Once we get past the fastpath checks, same code as rcu_barrier(). */ - if (rcu_expedited) - synchronize_rcu_expedited(); - else - rcu_barrier(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - -static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static unsigned long sync_rcu_preempt_exp_count; -static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); - -/* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. - */ -static int rcu_preempted_readers_exp(void) -{ - return rcu_preempt_ctrlblk.exp_tasks != NULL; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. - */ -static void rcu_report_exp_done(void) -{ - wake_up(&sync_rcu_preempt_exp_wq); -} - -/* - * Wait for an rcu-preempt grace period, but expedite it. The basic idea - * is to rely in the fact that there is but one CPU, and that it is - * illegal for a task to invoke synchronize_rcu_expedited() while in a - * preemptible-RCU read-side critical section. Therefore, any such - * critical sections must correspond to blocked tasks, which must therefore - * be on the ->blkd_tasks list. So just record the current head of the - * list in the ->exp_tasks pointer, and wait for all tasks including and - * after the task pointed to by ->exp_tasks to drain. - */ -void synchronize_rcu_expedited(void) -{ - unsigned long flags; - struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; - unsigned long snap; - - barrier(); /* ensure prior action seen before grace period. */ - - WARN_ON_ONCE(rcu_preempt_running_reader()); - - /* - * Acquire lock so that there is only one preemptible RCU grace - * period in flight. Of course, if someone does the expedited - * grace period for us while we are acquiring the lock, just leave. - */ - snap = sync_rcu_preempt_exp_count + 1; - mutex_lock(&sync_rcu_preempt_exp_mutex); - if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) - goto unlock_mb_ret; /* Others did our work for us. */ - - local_irq_save(flags); - - /* - * All RCU readers have to already be on blkd_tasks because - * we cannot legally be executing in an RCU read-side critical - * section. - */ - - /* Snapshot current head of ->blkd_tasks list. */ - rpcp->exp_tasks = rpcp->blkd_tasks.next; - if (rpcp->exp_tasks == &rpcp->blkd_tasks) - rpcp->exp_tasks = NULL; - - /* Wait for tail of ->blkd_tasks list to drain. */ - if (!rcu_preempted_readers_exp()) { - local_irq_restore(flags); - } else { - rcu_initiate_boost(); - local_irq_restore(flags); - wait_event(sync_rcu_preempt_exp_wq, - !rcu_preempted_readers_exp()); - } - - /* Clean up and exit. */ - barrier(); /* ensure expedited GP seen before counter increment. 
*/ - sync_rcu_preempt_exp_count++; -unlock_mb_ret: - mutex_unlock(&sync_rcu_preempt_exp_mutex); - barrier(); /* ensure subsequent action seen after grace period. */ -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -/* - * Does preemptible RCU need the CPU to stay out of dynticks mode? - */ -int rcu_preempt_needs_cpu(void) -{ - return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; -} - -#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ - #ifdef CONFIG_RCU_TRACE /* @@ -895,79 +138,6 @@ static void rcu_preempt_process_callbacks(void) { } -#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ - -#ifdef CONFIG_RCU_BOOST - -/* - * Wake up rcu_kthread() to process callbacks now eligible for invocation - * or to boost readers. - */ -static void invoke_rcu_callbacks(void) -{ - have_rcu_kthread_work = 1; - if (rcu_kthread_task != NULL) - wake_up(&rcu_kthread_wq); -} - -#ifdef CONFIG_RCU_TRACE - -/* - * Is the current CPU running the RCU-callbacks kthread? - * Caller must have preemption disabled. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return rcu_kthread_task == current; -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -/* - * This kthread invokes RCU callbacks whose grace periods have - * elapsed. It is awakened as needed, and takes the place of the - * RCU_SOFTIRQ that is used for this purpose when boosting is disabled. - * This is a kthread, but it is never stopped, at least not until - * the system goes down. - */ -static int rcu_kthread(void *arg) -{ - unsigned long work; - unsigned long morework; - unsigned long flags; - - for (;;) { - wait_event_interruptible(rcu_kthread_wq, - have_rcu_kthread_work != 0); - morework = rcu_boost(); - local_irq_save(flags); - work = have_rcu_kthread_work; - have_rcu_kthread_work = morework; - local_irq_restore(flags); - if (work) - rcu_process_callbacks(NULL); - schedule_timeout_interruptible(1); /* Leave CPU for others. */ - } - - return 0; /* Not reached, but needed to shut gcc up. */ -} - -/* - * Spawn the kthread that invokes RCU callbacks. - */ -static int __init rcu_spawn_kthreads(void) -{ - struct sched_param sp; - - rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); - sp.sched_priority = RCU_BOOST_PRIO; - sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); - return 0; -} -early_initcall(rcu_spawn_kthreads); - -#else /* #ifdef CONFIG_RCU_BOOST */ - /* Hold off callback invocation until early_initcall() time. 
*/ static int rcu_scheduler_fully_active __read_mostly; @@ -1001,8 +171,6 @@ static int __init rcu_scheduler_really_started(void) } early_initcall(rcu_scheduler_really_started); -#endif /* #else #ifdef CONFIG_RCU_BOOST */ - #ifdef CONFIG_DEBUG_LOCK_ALLOC #include @@ -1020,25 +188,6 @@ void __init rcu_scheduler_starting(void) #ifdef CONFIG_RCU_TRACE -#ifdef CONFIG_RCU_BOOST - -static void rcu_initiate_boost_trace(void) -{ - if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) - rcu_preempt_ctrlblk.n_balk_blkd_tasks++; - else if (rcu_preempt_ctrlblk.gp_tasks == NULL && - rcu_preempt_ctrlblk.exp_tasks == NULL) - rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; - else if (rcu_preempt_ctrlblk.boost_tasks != NULL) - rcu_preempt_ctrlblk.n_balk_boost_tasks++; - else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) - rcu_preempt_ctrlblk.n_balk_notyet++; - else - rcu_preempt_ctrlblk.n_balk_nos++; -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) { unsigned long flags; @@ -1105,9 +254,6 @@ MODULE_LICENSE("GPL"); static void check_cpu_stall_preempt(void) { -#ifdef CONFIG_TINY_PREEMPT_RCU - check_cpu_stall(&rcu_preempt_ctrlblk.rcb); -#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ } #endif /* #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.3-70-g09d2 From 221304e95e1466fb49b630f67a719cc735ec5353 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 08:59:52 -0700 Subject: rcu: Remove show_tiny_preempt_stats() With the removal of CONFIG_TINY_PREEMPT_RCU, show_tiny_preempt_stats() is now an empty function. This commit therefore eliminates it by inlining it. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny_plugin.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 29a4dd78c8b..cf0bc22434c 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -102,18 +102,6 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall_preempt()); } -#ifdef CONFIG_RCU_TRACE - -/* - * Because preemptible RCU does not exist, it is not necessary to - * dump out its statistics. - */ -static void show_tiny_preempt_stats(struct seq_file *m) -{ -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -202,7 +190,6 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) */ static int show_tiny_stats(struct seq_file *m, void *unused) { - show_tiny_preempt_stats(m); seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); return 0; -- cgit v1.2.3-70-g09d2 From 9acaac8ced57be8312cbf9f2a1e4f5e23b363493 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 09:02:40 -0700 Subject: rcu: Remove rcu_preempt_check_callbacks() With the removal of CONFIG_TINY_PREEMPT_RCU, rcu_preempt_check_callbacks() is now an empty function. This commit therefore eliminates it by inlining it. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 1 - kernel/rcutiny_plugin.h | 8 -------- 2 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index a0714a51b6d..91782827775 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -257,7 +257,6 @@ void rcu_check_callbacks(int cpu, int user) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); - rcu_preempt_check_callbacks(); } /* diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index cf0bc22434c..404b3a31e51 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -102,14 +102,6 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall_preempt()); } -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to check. - */ -static void rcu_preempt_check_callbacks(void) -{ -} - /* * Because preemptible RCU does not exist, it never has any callbacks * to remove. -- cgit v1.2.3-70-g09d2 From 47d65935a7f26f24417585e872e254c7ecc6596f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 09:05:34 -0700 Subject: rcu: Remove rcu_preempt_remove_callbacks() With the removal of CONFIG_TINY_PREEMPT_RCU, rcu_preempt_remove_callbacks() is now an empty function. This commit therefore eliminates it by inlining it. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 1 - kernel/rcutiny_plugin.h | 8 -------- 2 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 91782827775..6f5a2a6cc63 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -289,7 +289,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) *rcp->donetail = NULL; if (rcp->curtail == rcp->donetail) rcp->curtail = &rcp->rcucblist; - rcu_preempt_remove_callbacks(rcp); rcp->donetail = &rcp->rcucblist; local_irq_restore(flags); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 404b3a31e51..8b835b98114 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -102,14 +102,6 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall_preempt()); } -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to remove. - */ -static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) -{ -} - /* * Because preemptible RCU does not exist, it never has any callbacks * to process. -- cgit v1.2.3-70-g09d2 From 58c4e69d43df91fd6a55bc070474aad6b7cfb18d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 09:11:12 -0700 Subject: rcu: Remove rcu_preempt_process_callbacks() With the removal of CONFIG_TINY_PREEMPT_RCU, rcu_preempt_process_callbacks() is now an empty function. This commit therefore eliminates it by inlining it. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 1 - kernel/rcutiny_plugin.h | 8 -------- 2 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 6f5a2a6cc63..7fc2339b085 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -314,7 +314,6 @@ static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); - rcu_preempt_process_callbacks(); } /* diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 8b835b98114..bfe99240780 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -102,14 +102,6 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall_preempt()); } -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to process. - */ -static void rcu_preempt_process_callbacks(void) -{ -} - /* Hold off callback invocation until early_initcall() time. */ static int rcu_scheduler_fully_active __read_mostly; -- cgit v1.2.3-70-g09d2 From 9dc5ad32488a75504349372330cc228d4dd678db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 10:11:15 -0700 Subject: rcu: Simplify RCU_TINY RCU callback invocation TINY_PREEMPT_RCU could use a kthread to handle RCU callback invocation, which required an API to abstract kthread vs. softirq invocation. Now that TINY_PREEMPT_RCU is no longer with us, this commit retires this API in favor of direct use of the relevant softirq primitives. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 1 + include/linux/rcutiny.h | 4 ---- include/linux/rcutree.h | 1 - kernel/rcutiny.c | 14 +++++++++----- kernel/rcutiny_plugin.h | 33 --------------------------------- 5 files changed, 10 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 70b1522badd..257b50f9d2d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -216,6 +216,7 @@ static inline int rcu_preempt_depth(void) #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* Internal to kernel */ +extern void rcu_init(void); extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern void rcu_check_callbacks(int cpu, int user); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index d3c094ffa96..e756251b39b 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -27,10 +27,6 @@ #include -static inline void rcu_init(void) -{ -} - static inline void rcu_barrier_bh(void) { wait_rcu_gp(call_rcu_bh); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 952b7933930..3f1aa8f9866 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,7 +30,6 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H -extern void rcu_init(void); extern void rcu_note_context_switch(int cpu); extern int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies); extern void rcu_cpu_stall_reset(void); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 7fc2339b085..4adc9e26da3 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -44,7 +44,6 @@ /* Forward declarations for rcutiny_plugin.h. 
*/ struct rcu_ctrlblk; -static void invoke_rcu_callbacks(void); static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, @@ -227,7 +226,7 @@ void rcu_sched_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_callbacks(); + raise_softirq(RCU_SOFTIRQ); local_irq_restore(flags); } @@ -240,7 +239,7 @@ void rcu_bh_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_callbacks(); + raise_softirq(RCU_SOFTIRQ); local_irq_restore(flags); } @@ -277,7 +276,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) ACCESS_ONCE(rcp->rcucblist), need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread())); + false)); return; } @@ -307,7 +306,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread())); + false)); } static void rcu_process_callbacks(struct softirq_action *unused) @@ -379,3 +378,8 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu_bh); + +void rcu_init(void) +{ + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index bfe99240780..36fd83c544c 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -102,39 +102,6 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall_preempt()); } -/* Hold off callback invocation until early_initcall() time. */ -static int rcu_scheduler_fully_active __read_mostly; - -/* - * Start up softirq processing of callbacks. - */ -void invoke_rcu_callbacks(void) -{ - if (rcu_scheduler_fully_active) - raise_softirq(RCU_SOFTIRQ); -} - -#ifdef CONFIG_RCU_TRACE - -/* - * There is no callback kthread, so this thread is never it. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return false; -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -static int __init rcu_scheduler_really_started(void) -{ - rcu_scheduler_fully_active = 1; - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */ - return 0; -} -early_initcall(rcu_scheduler_really_started); - #ifdef CONFIG_DEBUG_LOCK_ALLOC #include -- cgit v1.2.3-70-g09d2 From 4879c84daa7bd6757b99ef76b30d4fcebccfcc6f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 10:18:04 -0700 Subject: rcu: Remove check_cpu_stall_preempt() With the removal of CONFIG_TINY_PREEMPT_RCU, check_cpu_stall_preempt() is now an empty function. This commit therefore eliminates it by inlining it. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny_plugin.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 36fd83c544c..bac3a6ecb99 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -82,8 +82,6 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); } -static void check_cpu_stall_preempt(void); - #endif /* #ifdef CONFIG_RCU_TRACE */ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) @@ -99,7 +97,6 @@ static void check_cpu_stalls(void) { RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); - RCU_TRACE(check_cpu_stall_preempt()); } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -182,8 +179,4 @@ MODULE_AUTHOR("Paul E. McKenney"); MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); MODULE_LICENSE("GPL"); -static void check_cpu_stall_preempt(void) -{ -} - #endif /* #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.3-70-g09d2 From 318bdcd95938ec3a530fc789da662ce159d50d46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 10:43:02 -0700 Subject: rcu: Consolidate rcutiny_plugin.h ifdefs This commit rearranges code in order to allow ifdefs to be consolidated in kernel/rcutiny_plugin.h, simplifying the code. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny_plugin.h | 86 +++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index bac3a6ecb99..65ef1800f4f 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -53,54 +53,10 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { }; #ifdef CONFIG_DEBUG_LOCK_ALLOC +#include + int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -#ifdef CONFIG_RCU_TRACE - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long j; - unsigned long js; - - if (rcu_cpu_stall_suppress) - return; - rcp->ticks_this_gp++; - j = jiffies; - js = rcp->jiffies_stall; - if (*rcp->curtail && ULONG_CMP_GE(j, js)) { - pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, - jiffies - rcp->gp_start, rcp->qlen); - dump_stack(); - } - if (*rcp->curtail && ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + - 3 * rcu_jiffies_till_stall_check() + 3; - else if (ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -{ -#ifdef CONFIG_RCU_TRACE - rcp->ticks_this_gp = 0; - rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -#endif /* #ifdef CONFIG_RCU_TRACE */ -} - -static void check_cpu_stalls(void) -{ - RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); - RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -#include /* * During boot, we forgive RCU lockdep issues. After this function is @@ -179,4 +135,42 @@ MODULE_AUTHOR("Paul E. 
McKenney"); MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); MODULE_LICENSE("GPL"); +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long j; + unsigned long js; + + if (rcu_cpu_stall_suppress) + return; + rcp->ticks_this_gp++; + j = jiffies; + js = rcp->jiffies_stall; + if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", + rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, + jiffies - rcp->gp_start, rcp->qlen); + dump_stack(); + } + if (*rcp->curtail && ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + + 3 * rcu_jiffies_till_stall_check() + 3; + else if (ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + #endif /* #ifdef CONFIG_RCU_TRACE */ + +static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) +{ +#ifdef CONFIG_RCU_TRACE + rcp->ticks_this_gp = 0; + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +#endif /* #ifdef CONFIG_RCU_TRACE */ +} + +static void check_cpu_stalls(void) +{ + RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); + RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); +} -- cgit v1.2.3-70-g09d2 From 2439b696cb5303f1eeb6aeebcee19e0056c3dd6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Apr 2013 10:15:52 -0700 Subject: rcu: Shrink TINY_RCU by moving exit_rcu() Now that TINY_PREEMPT_RCU is no more, exit_rcu() is always an empty function. But if TINY_RCU is going to have an empty function, it should be in include/linux/rcutiny.h, where it does not bloat the kernel. This commit therefore moves exit_rcu() out of kernel/rcupdate.c to kernel/rcutree_plugin.h, and places a static inline empty function in include/linux/rcutiny.h in order to shrink TINY_RCU a bit. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 2 -- include/linux/rcutiny.h | 4 ++++ include/linux/rcutree.h | 2 ++ kernel/rcupdate.c | 26 +------------------------- kernel/rcutree_plugin.h | 26 ++++++++++++++++++++++++++ 5 files changed, 33 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 257b50f9d2d..4b14bdc911d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -240,8 +240,6 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, struct task_struct *next) { } #endif /* CONFIG_RCU_USER_QS */ -extern void exit_rcu(void); - /** * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers * @a: Code that RCU needs to pay attention to. 
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 51230b63be9..e31005ee339 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -119,6 +119,10 @@ static inline void rcu_cpu_stall_reset(void) { } +static inline void exit_rcu(void) +{ +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC extern int rcu_scheduler_active __read_mostly; extern void rcu_scheduler_starting(void); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 3f1aa8f9866..226169d1bd2 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -85,6 +85,8 @@ extern void rcu_force_quiescent_state(void); extern void rcu_bh_force_quiescent_state(void); extern void rcu_sched_force_quiescent_state(void); +extern void exit_rcu(void); + extern void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 48ab70384a4..0be1fa2ea52 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -104,31 +104,7 @@ void __rcu_read_unlock(void) } EXPORT_SYMBOL_GPL(__rcu_read_unlock); -/* - * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. - */ -void exit_rcu(void) -{ - struct task_struct *t = current; - - if (likely(list_empty(¤t->rcu_node_entry))) - return; - t->rcu_read_lock_nesting = 1; - barrier(); - t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; - __rcu_read_unlock(); -} - -#else /* #ifdef CONFIG_PREEMPT_RCU */ - -void exit_rcu(void) -{ -} - -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +#endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 207844ea022..de701bbdb62 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -932,6 +932,24 @@ static void __init __rcu_init_preempt(void) rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); } +/* + * Check for a task exiting while in a preemptible-RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (likely(list_empty(¤t->rcu_node_entry))) + return; + t->rcu_read_lock_nesting = 1; + barrier(); + t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; + __rcu_read_unlock(); +} + #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ static struct rcu_state *rcu_state = &rcu_sched_state; @@ -1100,6 +1118,14 @@ static void __init __rcu_init_preempt(void) { } +/* + * Because preemptible RCU does not exist, tasks cannot possibly exit + * while in preemptible RCU read-side critical sections. + */ +void exit_rcu(void) +{ +} + #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST -- cgit v1.2.3-70-g09d2 From 14961444696effb2e660fe876e5c1880f8bc3932 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Apr 2013 07:49:22 -0700 Subject: rcu: Shrink TINY_RCU by reworking CPU-stall ifdefs TINY_RCU's reset_cpu_stall_ticks() and check_cpu_stalls() functions are defined unconditionally, and are empty functions if CONFIG_RCU_TRACE is disabled (which in turns disables detection of RCU CPU stalls). This commit saves a few lines of source code by defining these functions only if CONFIG_RCU_TRACE=y. Signed-off-by: Paul E. 
McKenney --- kernel/rcutiny.c | 4 ++-- kernel/rcutiny_plugin.h | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 4adc9e26da3..aa344111de3 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -204,7 +204,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - reset_cpu_stall_ticks(rcp); + RCU_TRACE(reset_cpu_stall_ticks(rcp)); if (rcp->rcucblist != NULL && rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; @@ -251,7 +251,7 @@ void rcu_bh_qs(int cpu) */ void rcu_check_callbacks(int cpu, int user) { - check_cpu_stalls(); + RCU_TRACE(check_cpu_stalls()); if (user || rcu_is_cpu_rrupt_from_idle()) rcu_sched_qs(cpu); else if (!in_softirq()) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 65ef1800f4f..0cd385acccf 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -158,15 +158,11 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); } -#endif /* #ifdef CONFIG_RCU_TRACE */ - static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) { -#ifdef CONFIG_RCU_TRACE rcp->ticks_this_gp = 0; rcp->gp_start = jiffies; rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -#endif /* #ifdef CONFIG_RCU_TRACE */ } static void check_cpu_stalls(void) @@ -174,3 +170,5 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); } + +#endif /* #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.3-70-g09d2 From ee23871389d51e07380d23887333622fbe7d3dd9 Mon Sep 17 00:00:00 2001 From: Ivo Sieben Date: Mon, 3 Jun 2013 12:12:02 +0200 Subject: genirq: Set irq thread to RT priority on creation When a threaded irq handler is installed the irq thread is initially created on normal scheduling priority. Only after the irq thread is woken up it sets its priority to RT_FIFO MAX_USER_RT_PRIO/2 itself. This means that interrupts that occur directly after the irq handler is installed will be handled on a normal scheduling priority instead of the realtime priority that one would expect. Fix this by setting the RT priority on creation of the irq_thread. 
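As an editorial illustration of the resulting ordering (a simplified sketch of the __setup_irq() path after this change, not a verbatim excerpt of the patch; error handling and the nested-thread case are omitted):

    static const struct sched_param param = {
            .sched_priority = MAX_USER_RT_PRIO/2,
    };
    struct task_struct *t;

    t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name);
    if (IS_ERR(t))
            return PTR_ERR(t);

    /*
     * Promote the thread to SCHED_FIFO before the interrupt can fire,
     * so even the very first hardirq is serviced at RT priority.
     */
    sched_setscheduler(t, SCHED_FIFO, &param);

The key point is that sched_setscheduler() now runs in the context that installs the handler, before the interrupt is enabled, rather than inside irq_thread() after its first wakeup.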
Signed-off-by: Ivo Sieben Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1370254322-17240-1-git-send-email-meltedpianoman@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fa17855ca65..e16caa81f88 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -840,9 +840,6 @@ static void irq_thread_dtor(struct callback_head *unused) static int irq_thread(void *data) { struct callback_head on_exit_work; - static const struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); irqreturn_t (*handler_fn)(struct irq_desc *desc, @@ -854,8 +851,6 @@ static int irq_thread(void *data) else handler_fn = irq_thread_fn; - sched_setscheduler(current, SCHED_FIFO, ¶m); - init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, false); @@ -950,6 +945,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (new->thread_fn && !nested) { struct task_struct *t; + static const struct sched_param param = { + .sched_priority = MAX_USER_RT_PRIO/2, + }; t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name); @@ -957,6 +955,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = PTR_ERR(t); goto out_mput; } + + sched_setscheduler(t, SCHED_FIFO, ¶m); + /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code -- cgit v1.2.3-70-g09d2 From 9350de06be45a5a8b927ac6577c9d35de61c90ca Mon Sep 17 00:00:00 2001 From: Bernie Thompson Date: Sat, 1 Jun 2013 00:47:43 +0000 Subject: PM / wakeup: Adjust messaging for wake events during suspend This adds in a new message to the wakeup code which adds an indication to the log that suspend was cancelled due to a wake event occouring during the suspend sequence. It also adjusts the message printed in suspend.c to reflect the potential that a suspend was aborted, as opposed to a device failing to suspend. Without these message adjustments one can end up with a kernel log that says that a device failed to suspend with no actual device suspend failures, which can be confusing to the log examiner. Signed-off-by: Bernie Thompson Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/wakeup.c | 4 +++- kernel/power/suspend.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 79715e7fa43..407a2efa10b 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -707,8 +707,10 @@ bool pm_wakeup_pending(void) } spin_unlock_irqrestore(&events_lock, flags); - if (ret) + if (ret) { + pr_info("PM: Wakeup pending, aborting suspend\n"); print_active_wakeup_sources(); + } return ret; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index bef86d121eb..ece04223bb1 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -269,7 +269,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: Some devices failed to suspend\n"); + pr_err("PM: Some devices failed to suspend, or early wake event detected\n"); goto Recover_platform; } suspend_test_finish("suspend devices"); -- cgit v1.2.3-70-g09d2 From ad71d889b88055e61e3970a6744a271a51a94f42 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 30 Apr 2013 15:46:14 -0400 Subject: tracing: Add function probe to trigger a ftrace dump to console Add the "dump" command to have the ftrace buffer dumped to console if a function is hit. This is useful when debugging a tripple fault, where you have an idea of a function that is called just before the tripple fault occurs, and can tell ftrace to dump its content out to the console before it continues. Format is: :dump echo 'bad_address:dump' > /debug/tracing/set_ftrace_filter To remove this: echo '!bad_address:dump' > /debug/tracing/set_ftrace_filter Requested-by: Luis Claudio R. Goncalves Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 7 +++++ kernel/trace/trace_functions.c | 59 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 61 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index bfe8c29b1f1..cc9ec57e157 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -2430,6 +2430,13 @@ The following commands are supported: echo '!schedule:disable_event:sched:sched_switch' > \ set_ftrace_filter +- dump + When the function is hit, it will dump the contents of the ftrace + ring buffer to the console. This is useful if you need to debug + something, and want to dump the trace when a certain function + is hit. Perhaps its a function that is called before a tripple + fault happens and does not allow you to get a regular dump. 
+ trace_pipe ---------- diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c4d6d719198..d7c8719734b 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -290,6 +290,13 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) trace_dump_stack(STACK_SKIP); } +static void +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ALL); +} + static int ftrace_probe_print(const char *name, struct seq_file *m, unsigned long ip, void *data) @@ -327,6 +334,13 @@ ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, return ftrace_probe_print("stacktrace", m, ip, data); } +static int +ftrace_dump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("dump", m, ip, data); +} + static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, .print = ftrace_traceon_print, @@ -342,6 +356,11 @@ static struct ftrace_probe_ops stacktrace_count_probe_ops = { .print = ftrace_stacktrace_print, }; +static struct ftrace_probe_ops dump_probe_ops = { + .func = ftrace_dump_probe, + .print = ftrace_dump_print, +}; + static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_traceon_print, @@ -425,6 +444,19 @@ ftrace_stacktrace_callback(struct ftrace_hash *hash, param, enable); } +static int +ftrace_dump_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = &dump_probe_ops; + + /* Only dump once. */ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + static struct ftrace_func_command ftrace_traceon_cmd = { .name = "traceon", .func = ftrace_trace_onoff_callback, @@ -440,6 +472,11 @@ static struct ftrace_func_command ftrace_stacktrace_cmd = { .func = ftrace_stacktrace_callback, }; +static struct ftrace_func_command ftrace_dump_cmd = { + .name = "dump", + .func = ftrace_dump_callback, +}; + static int __init init_func_cmd_traceon(void) { int ret; @@ -450,13 +487,25 @@ static int __init init_func_cmd_traceon(void) ret = register_ftrace_command(&ftrace_traceon_cmd); if (ret) - unregister_ftrace_command(&ftrace_traceoff_cmd); + goto out_free_traceoff; ret = register_ftrace_command(&ftrace_stacktrace_cmd); - if (ret) { - unregister_ftrace_command(&ftrace_traceoff_cmd); - unregister_ftrace_command(&ftrace_traceon_cmd); - } + if (ret) + goto out_free_traceon; + + ret = register_ftrace_command(&ftrace_dump_cmd); + if (ret) + goto out_free_stacktrace; + + return 0; + + out_free_stacktrace: + unregister_ftrace_command(&ftrace_stacktrace_cmd); + out_free_traceon: + unregister_ftrace_command(&ftrace_traceon_cmd); + out_free_traceoff: + unregister_ftrace_command(&ftrace_traceoff_cmd); + return ret; } #else -- cgit v1.2.3-70-g09d2 From 90e3c03c3a09a7b176b3fe59d78f5d9755ac8e37 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 30 Apr 2013 19:00:46 -0400 Subject: tracing: Add function probe to trigger a ftrace dump of current CPU trace Add the "cpudump" command to have the current CPU ftrace buffer dumped to console if a function is hit. This is useful when debugging a tripple fault, where you have an idea of a function that is called just before the tripple fault occurs, and can tell ftrace to dump its content out to the console before it continues. 
This differs from the "dump" command as it only dumps the content of the ring buffer for the currently executing CPU, and does not show the contents of the other CPUs. Format is: :cpudump echo 'bad_address:cpudump' > /debug/tracing/set_ftrace_filter To remove this: echo '!bad_address:cpudump' > /debug/tracing/set_ftrace_filter Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 6 ++++++ kernel/trace/trace_functions.c | 44 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index cc9ec57e157..b937c6e2163 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -2437,6 +2437,12 @@ The following commands are supported: is hit. Perhaps its a function that is called before a tripple fault happens and does not allow you to get a regular dump. +- cpudump + When the function is hit, it will dump the contents of the ftrace + ring buffer for the current CPU to the console. Unlike the "dump" + command, it only prints out the contents of the ring buffer for the + CPU that executed the function that triggered the dump. + trace_pipe ---------- diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d7c8719734b..b863f93b30f 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -297,6 +297,14 @@ ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) ftrace_dump(DUMP_ALL); } +/* Only dump the current CPU buffer. */ +static void +ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ORIG); +} + static int ftrace_probe_print(const char *name, struct seq_file *m, unsigned long ip, void *data) @@ -341,6 +349,13 @@ ftrace_dump_print(struct seq_file *m, unsigned long ip, return ftrace_probe_print("dump", m, ip, data); } +static int +ftrace_cpudump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("cpudump", m, ip, data); +} + static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, .print = ftrace_traceon_print, @@ -361,6 +376,11 @@ static struct ftrace_probe_ops dump_probe_ops = { .print = ftrace_dump_print, }; +static struct ftrace_probe_ops cpudump_probe_ops = { + .func = ftrace_cpudump_probe, + .print = ftrace_cpudump_print, +}; + static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_traceon_print, @@ -457,6 +477,19 @@ ftrace_dump_callback(struct ftrace_hash *hash, "1", enable); } +static int +ftrace_cpudump_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = &cpudump_probe_ops; + + /* Only dump once. 
*/ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + static struct ftrace_func_command ftrace_traceon_cmd = { .name = "traceon", .func = ftrace_trace_onoff_callback, @@ -477,6 +510,11 @@ static struct ftrace_func_command ftrace_dump_cmd = { .func = ftrace_dump_callback, }; +static struct ftrace_func_command ftrace_cpudump_cmd = { + .name = "cpudump", + .func = ftrace_cpudump_callback, +}; + static int __init init_func_cmd_traceon(void) { int ret; @@ -497,8 +535,14 @@ static int __init init_func_cmd_traceon(void) if (ret) goto out_free_stacktrace; + ret = register_ftrace_command(&ftrace_cpudump_cmd); + if (ret) + goto out_free_dump; + return 0; + out_free_dump: + unregister_ftrace_command(&ftrace_dump_cmd); out_free_stacktrace: unregister_ftrace_command(&ftrace_stacktrace_cmd); out_free_traceon: -- cgit v1.2.3-70-g09d2 From 8092e808a31839c502a52d391b15f31c1d8764f5 Mon Sep 17 00:00:00 2001 From: Harsh Prateek Bora Date: Fri, 24 May 2013 12:52:17 +0530 Subject: tracing/trivial: Consolidate error return condition Consolidate the checks for !enabled and !param to return -EINVAL in event_enable_func(). Link: http://lkml.kernel.org/r/1369380137-12452-1-git-send-email-harsh@linux.vnet.ibm.com Signed-off-by: Harsh Prateek Bora Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 27963e2bf4b..db086f172cf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2011,10 +2011,7 @@ event_enable_func(struct ftrace_hash *hash, int ret; /* hash funcs only work with set_ftrace_filter */ - if (!enabled) - return -EINVAL; - - if (!param) + if (!enabled || !param) return -EINVAL; system = strsep(¶m, ":"); -- cgit v1.2.3-70-g09d2 From 238ae93d699d59876b470bf6455de22bcfaa9a1b Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Sun, 26 May 2013 16:52:01 +0800 Subject: tracing: Fix file mode of free_buffer Commit 4f271a2a60c748599b30bb4dafff30d770439b96 (tracing: Add a proc file to stop tracing and free buffer) implement a method to free up ring buffer in kernel memory in the release code path of free_buffer's fd. Then we don't need read/write support for free_buffer, indeed we just have a dummy write fop, and don't implement read fop. So the 0200 is more reasonable file mode for free_buffer than the current file mode 0644. 
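For readers less used to octal mode notation, the modes involved decode as follows (an editorial aside, not part of the patch; the macro names are only labels for this note):

    /* Octal modes used with trace_create_file() in this area of the code. */
    #define MODE_OLD_FREE_BUFFER  0644  /* rw-r--r--: readable, although no read fop exists */
    #define MODE_NEW_FREE_BUFFER  0200  /* -w-------: owner write-only, matching the dummy write fop */
    #define MODE_TRACE_MARKER     0220  /* -w--w----: owner and group write-only */
    #define MODE_TOTAL_SIZE_KB    0444  /* r--r--r--: world readable, no write fop needed */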
Link: http://lkml.kernel.org/r/20130526085201.GA3183@udknight Acked-by: Vaibhav Nagarnaik Acked-by: David Sharp Signed-off-by: Wang YanQing Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a41023a1f8..5f4a09c12e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5935,7 +5935,7 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("buffer_total_size_kb", 0444, d_tracer, tr, &tracing_total_entries_fops); - trace_create_file("free_buffer", 0644, d_tracer, + trace_create_file("free_buffer", 0200, d_tracer, tr, &tracing_free_buffer_fops); trace_create_file("trace_marker", 0220, d_tracer, -- cgit v1.2.3-70-g09d2 From 7614c3dc74733dff4b0e774f7a894b9ea6ec508c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 May 2013 20:01:16 -0400 Subject: ftrace: Use schedule_on_each_cpu() as a heavy synchronize_sched() The function tracer uses preempt_disable/enable_notrace() for synchronization between reading registered ftrace_ops and unregistering them. Most of the ftrace_ops are global permanent structures that do not require this synchronization. That is, ops may be added and removed from the hlist but are never freed, and wont hurt if a synchronization is missed. But this is not true for dynamically created ftrace_ops or control_ops, which are used by the perf function tracing. The problem here is that the function tracer can be used to trace kernel/user context switches as well as going to and from idle. Basically, it can be used to trace blind spots of the RCU subsystem. This means that even though preempt_disable() is done, a synchronize_sched() will ignore CPUs that haven't made it out of user space or idle. These can include functions that are being traced just before entering or exiting the kernel sections. To implement the RCU synchronization, instead of using synchronize_sched() the use of schedule_on_each_cpu() is performed. This means that when a dynamically allocated ftrace_ops, or a control ops is being unregistered, all CPUs must be touched and execute a ftrace_sync() stub function via the work queues. This will rip CPUs out from idle or in dynamic tick mode. This only happens when a user disables perf function tracing or other dynamically allocated function tracers, but it allows us to continue to debug RCU and context tracking with function tracing. Link: http://lkml.kernel.org/r/1369785676.15552.55.camel@gandalf.local.home Cc: "Paul E. McKenney" Cc: Tejun Heo Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c508ff33c6..800a8a2fbdd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -413,6 +413,17 @@ static int __register_ftrace_function(struct ftrace_ops *ops) return 0; } +static void ftrace_sync(struct work_struct *work) +{ + /* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + * + * Yes, function tracing is rude. 
+ */ +} + static int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; @@ -440,8 +451,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) * so there'll be no new users. We must ensure * all current users are done before we free * the control data. + * Note synchronize_sched() is not enough, as we + * use preempt_disable() to do RCU, but the function + * tracer can be called where RCU is not active + * (before user_exit()). */ - synchronize_sched(); + schedule_on_each_cpu(ftrace_sync); control_ops_free(ops); } } else @@ -456,9 +471,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. + * + * Again, normal synchronize_sched() is not good enough. + * We need to do a hard force of sched synchronization. */ if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - synchronize_sched(); + schedule_on_each_cpu(ftrace_sync); + return 0; } -- cgit v1.2.3-70-g09d2 From aaf6ac0f0871cb7fc0f28f3a00edf329bc7adc29 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 7 Jun 2013 15:07:48 +0900 Subject: tracing: Do not call kmem_cache_free() on allocation failure There's no point calling it when _alloc() failed. Link: http://lkml.kernel.org/r/1370585268-29169-1-git-send-email-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db086f172cf..f57b01574a3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -97,7 +97,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field = kmem_cache_alloc(field_cachep, GFP_TRACE); if (!field) - goto err; + return -ENOMEM; field->name = name; field->type = type; @@ -114,11 +114,6 @@ static int __trace_define_field(struct list_head *head, const char *type, list_add(&field->link, head); return 0; - -err: - kmem_cache_free(field_cachep, field); - - return -ENOMEM; } int trace_define_field(struct ftrace_event_call *call, const char *type, -- cgit v1.2.3-70-g09d2 From 11682a41618f8094cb7a9330b4b6a12ffaef5774 Mon Sep 17 00:00:00 2001 From: Marcus Gelderie Date: Tue, 4 Jun 2013 09:32:09 +0200 Subject: alarmtimer: Export symbols of functions declared in linux/alarmtimer.h Export symbols so they can be used by drivers/staging/android/alarm-dev.c if it is built as a module. So far alarm-dev is built-in but module support is planned (see drivers/staging/android/TODO). 
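To illustrate what the exports enable, a minimal module using this API could look like the following sketch (illustrative only, not part of the patch; it assumes the alarmtimer callback signature used by this kernel series, and all demo_* names are hypothetical):

    #include <linux/module.h>
    #include <linux/alarmtimer.h>
    #include <linux/ktime.h>

    static struct alarm demo_alarm;

    static enum alarmtimer_restart demo_fired(struct alarm *a, ktime_t now)
    {
            pr_info("demo alarm fired\n");
            return ALARMTIMER_NORESTART;
    }

    static int __init demo_init(void)
    {
            alarm_init(&demo_alarm, ALARM_REALTIME, demo_fired);
            alarm_start_relative(&demo_alarm, ktime_set(5, 0)); /* fire in ~5s */
            return 0;
    }

    static void __exit demo_exit(void)
    {
            alarm_cancel(&demo_alarm);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");

Without the EXPORT_SYMBOL_GPL() annotations added by this patch, loading such a module would fail with unresolved symbols even though the functions are declared in linux/alarmtimer.h.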
Signed-off-by: Marcus Gelderie [jstultz: tweaked commit message, also export newly added functions] Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 3e5cba27447..eec50fcef9e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -204,6 +204,7 @@ ktime_t alarm_expires_remaining(const struct alarm *alarm) struct alarm_base *base = &alarm_bases[alarm->type]; return ktime_sub(alarm->node.expires, base->gettime()); } +EXPORT_SYMBOL_GPL(alarm_expires_remaining); #ifdef CONFIG_RTC_CLASS /** @@ -309,6 +310,7 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; } +EXPORT_SYMBOL_GPL(alarm_init); /** * alarm_start - Sets an absolute alarm to fire @@ -329,6 +331,7 @@ int alarm_start(struct alarm *alarm, ktime_t start) spin_unlock_irqrestore(&base->lock, flags); return ret; } +EXPORT_SYMBOL_GPL(alarm_start); /** * alarm_start_relative - Sets a relative alarm to fire @@ -342,6 +345,7 @@ int alarm_start_relative(struct alarm *alarm, ktime_t start) start = ktime_add(start, base->gettime()); return alarm_start(alarm, start); } +EXPORT_SYMBOL_GPL(alarm_start_relative); void alarm_restart(struct alarm *alarm) { @@ -354,6 +358,7 @@ void alarm_restart(struct alarm *alarm) alarmtimer_enqueue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); } +EXPORT_SYMBOL_GPL(alarm_restart); /** * alarm_try_to_cancel - Tries to cancel an alarm timer @@ -375,6 +380,7 @@ int alarm_try_to_cancel(struct alarm *alarm) spin_unlock_irqrestore(&base->lock, flags); return ret; } +EXPORT_SYMBOL_GPL(alarm_try_to_cancel); /** @@ -392,6 +398,7 @@ int alarm_cancel(struct alarm *alarm) cpu_relax(); } } +EXPORT_SYMBOL_GPL(alarm_cancel); u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) @@ -424,6 +431,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) alarm->node.expires = ktime_add(alarm->node.expires, interval); return overrun; } +EXPORT_SYMBOL_GPL(alarm_forward); u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) { @@ -431,7 +439,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) return alarm_forward(alarm, base->gettime(), interval); } - +EXPORT_SYMBOL_GPL(alarm_forward_now); /** -- cgit v1.2.3-70-g09d2 From 38ff87f77af0b5a93fc8581cff1d6e5692ab8970 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sat, 1 Jun 2013 23:39:40 -0700 Subject: sched_clock: Make ARM's sched_clock generic for all architectures Nothing about the sched_clock implementation in the ARM port is specific to the architecture. Generalize the code so that other architectures can use it by selecting GENERIC_SCHED_CLOCK. 
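For context, a platform or clocksource driver registers with this framework by handing it a narrow read callback; a minimal sketch (illustrative only, the demo_* names, counter base and 24 MHz rate are hypothetical) looks like:

    #include <linux/sched_clock.h>
    #include <linux/io.h>
    #include <linux/init.h>

    static void __iomem *demo_counter_base;    /* hypothetical free-running counter */

    static u32 notrace demo_read_sched_clock(void)
    {
            return readl_relaxed(demo_counter_base);
    }

    static void __init demo_timer_init(void)
    {
            /* 32-bit counter ticking at 24 MHz (example values). */
            setup_sched_clock(demo_read_sched_clock, 32, 24000000);
    }

After this series the only per-architecture requirement is selecting GENERIC_SCHED_CLOCK, as the ARM Kconfig hunk below does.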
Signed-off-by: Stephen Boyd [jstultz: Merge minor collisions with other patches in my tree] Signed-off-by: John Stultz --- arch/arm/Kconfig | 1 + arch/arm/common/timer-sp.c | 2 +- arch/arm/include/asm/sched_clock.h | 16 --- arch/arm/kernel/Makefile | 2 +- arch/arm/kernel/arch_timer.c | 2 +- arch/arm/kernel/sched_clock.c | 216 ------------------------------ arch/arm/kernel/time.c | 4 +- arch/arm/mach-davinci/time.c | 2 +- arch/arm/mach-imx/time.c | 2 +- arch/arm/mach-integrator/integrator_ap.c | 2 +- arch/arm/mach-ixp4xx/common.c | 2 +- arch/arm/mach-mmp/time.c | 2 +- arch/arm/mach-msm/timer.c | 2 +- arch/arm/mach-omap1/time.c | 2 +- arch/arm/mach-omap2/timer.c | 2 +- arch/arm/mach-pxa/time.c | 2 +- arch/arm/mach-sa1100/time.c | 2 +- arch/arm/mach-u300/timer.c | 2 +- arch/arm/plat-iop/time.c | 2 +- arch/arm/plat-omap/counter_32k.c | 2 +- arch/arm/plat-orion/time.c | 2 +- arch/arm/plat-samsung/samsung-time.c | 2 +- arch/arm/plat-versatile/sched-clock.c | 2 +- drivers/clocksource/bcm2835_timer.c | 2 +- drivers/clocksource/clksrc-dbx500-prcmu.c | 3 +- drivers/clocksource/dw_apb_timer_of.c | 3 +- drivers/clocksource/mxs_timer.c | 2 +- drivers/clocksource/nomadik-mtu.c | 2 +- drivers/clocksource/samsung_pwm_timer.c | 2 +- drivers/clocksource/tegra20_timer.c | 2 +- drivers/clocksource/time-armada-370-xp.c | 2 +- drivers/clocksource/timer-marco.c | 2 +- drivers/clocksource/timer-prima2.c | 2 +- include/linux/sched_clock.h | 21 +++ init/Kconfig | 3 + init/main.c | 2 + kernel/time/Makefile | 1 + kernel/time/sched_clock.c | 215 +++++++++++++++++++++++++++++ 38 files changed, 273 insertions(+), 266 deletions(-) delete mode 100644 arch/arm/include/asm/sched_clock.h delete mode 100644 arch/arm/kernel/sched_clock.c create mode 100644 include/linux/sched_clock.h create mode 100644 kernel/time/sched_clock.c (limited to 'kernel') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 49d993cee51..53d3a356f61 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -14,6 +14,7 @@ config ARM select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP + select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_IDLE_POLL_SETUP select GENERIC_STRNCPY_FROM_USER diff --git a/arch/arm/common/timer-sp.c b/arch/arm/common/timer-sp.c index ddc74076960..023ee63827a 100644 --- a/arch/arm/common/timer-sp.c +++ b/arch/arm/common/timer-sp.c @@ -28,8 +28,8 @@ #include #include #include +#include -#include #include #include diff --git a/arch/arm/include/asm/sched_clock.h b/arch/arm/include/asm/sched_clock.h deleted file mode 100644 index 3d520ddca61..00000000000 --- a/arch/arm/include/asm/sched_clock.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * sched_clock.h: support for extending counters to full 64-bit ns counter - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef ASM_SCHED_CLOCK -#define ASM_SCHED_CLOCK - -extern void sched_clock_postinit(void); -extern void setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate); - -extern unsigned long long (*sched_clock_func)(void); - -#endif diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 5f3338eacad..97cb0576d07 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -16,7 +16,7 @@ CFLAGS_REMOVE_return_address.o = -pg # Object file lists. 
obj-y := elf.o entry-armv.o entry-common.o irq.o opcodes.o \ - process.o ptrace.o return_address.o sched_clock.o \ + process.o ptrace.o return_address.o \ setup.o signal.o stacktrace.o sys_arm.o time.o traps.o obj-$(CONFIG_ATAGS) += atags_parse.o diff --git a/arch/arm/kernel/arch_timer.c b/arch/arm/kernel/arch_timer.c index 59dcdced6e3..221f07b11cc 100644 --- a/arch/arm/kernel/arch_timer.c +++ b/arch/arm/kernel/arch_timer.c @@ -11,9 +11,9 @@ #include #include #include +#include #include -#include #include diff --git a/arch/arm/kernel/sched_clock.c b/arch/arm/kernel/sched_clock.c deleted file mode 100644 index a781c59b93c..00000000000 --- a/arch/arm/kernel/sched_clock.c +++ /dev/null @@ -1,216 +0,0 @@ -/* - * sched_clock.c: support for extending counters to full 64-bit ns counter - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -struct clock_data { - u64 epoch_ns; - u32 epoch_cyc; - u32 epoch_cyc_copy; - unsigned long rate; - u32 mult; - u32 shift; - bool suspended; -}; - -static void sched_clock_poll(unsigned long wrap_ticks); -static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); -static int irqtime = -1; - -core_param(irqtime, irqtime, int, 0400); - -static struct clock_data cd = { - .mult = NSEC_PER_SEC / HZ, -}; - -static u32 __read_mostly sched_clock_mask = 0xffffffff; - -static u32 notrace jiffy_sched_clock_read(void) -{ - return (u32)(jiffies - INITIAL_JIFFIES); -} - -static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; - -static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) -{ - return (cyc * mult) >> shift; -} - -static unsigned long long notrace cyc_to_sched_clock(u32 cyc, u32 mask) -{ - u64 epoch_ns; - u32 epoch_cyc; - - /* - * Load the epoch_cyc and epoch_ns atomically. We do this by - * ensuring that we always write epoch_cyc, epoch_ns and - * epoch_cyc_copy in strict order, and read them in strict order. - * If epoch_cyc and epoch_cyc_copy are not equal, then we're in - * the middle of an update, and we should repeat the load. - */ - do { - epoch_cyc = cd.epoch_cyc; - smp_rmb(); - epoch_ns = cd.epoch_ns; - smp_rmb(); - } while (epoch_cyc != cd.epoch_cyc_copy); - - return epoch_ns + cyc_to_ns((cyc - epoch_cyc) & mask, cd.mult, cd.shift); -} - -/* - * Atomically update the sched_clock epoch. - */ -static void notrace update_sched_clock(void) -{ - unsigned long flags; - u32 cyc; - u64 ns; - - cyc = read_sched_clock(); - ns = cd.epoch_ns + - cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); - /* - * Write epoch_cyc and epoch_ns in a way that the update is - * detectable in cyc_to_fixed_sched_clock(). 
- */ - raw_local_irq_save(flags); - cd.epoch_cyc_copy = cyc; - smp_wmb(); - cd.epoch_ns = ns; - smp_wmb(); - cd.epoch_cyc = cyc; - raw_local_irq_restore(flags); -} - -static void sched_clock_poll(unsigned long wrap_ticks) -{ - mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks)); - update_sched_clock(); -} - -void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) -{ - unsigned long r, w; - u64 res, wrap; - char r_unit; - - if (cd.rate > rate) - return; - - BUG_ON(bits > 32); - WARN_ON(!irqs_disabled()); - read_sched_clock = read; - sched_clock_mask = (1 << bits) - 1; - cd.rate = rate; - - /* calculate the mult/shift to convert counter ticks to ns. */ - clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); - - r = rate; - if (r >= 4000000) { - r /= 1000000; - r_unit = 'M'; - } else if (r >= 1000) { - r /= 1000; - r_unit = 'k'; - } else - r_unit = ' '; - - /* calculate how many ns until we wrap */ - wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); - do_div(wrap, NSEC_PER_MSEC); - w = wrap; - - /* calculate the ns resolution of this counter */ - res = cyc_to_ns(1ULL, cd.mult, cd.shift); - pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", - bits, r, r_unit, res, w); - - /* - * Start the timer to keep sched_clock() properly updated and - * sets the initial epoch. - */ - sched_clock_timer.data = msecs_to_jiffies(w - (w / 10)); - update_sched_clock(); - - /* - * Ensure that sched_clock() starts off at 0ns - */ - cd.epoch_ns = 0; - - /* Enable IRQ time accounting if we have a fast enough sched_clock */ - if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) - enable_sched_clock_irqtime(); - - pr_debug("Registered %pF as sched_clock source\n", read); -} - -static unsigned long long notrace sched_clock_32(void) -{ - u32 cyc = read_sched_clock(); - return cyc_to_sched_clock(cyc, sched_clock_mask); -} - -unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; - -unsigned long long notrace sched_clock(void) -{ - if (cd.suspended) - return cd.epoch_ns; - - return sched_clock_func(); -} - -void __init sched_clock_postinit(void) -{ - /* - * If no sched_clock function has been provided at that point, - * make it the final one one. 
- */ - if (read_sched_clock == jiffy_sched_clock_read) - setup_sched_clock(jiffy_sched_clock_read, 32, HZ); - - sched_clock_poll(sched_clock_timer.data); -} - -static int sched_clock_suspend(void) -{ - sched_clock_poll(sched_clock_timer.data); - cd.suspended = true; - return 0; -} - -static void sched_clock_resume(void) -{ - cd.epoch_cyc = read_sched_clock(); - cd.epoch_cyc_copy = cd.epoch_cyc; - cd.suspended = false; -} - -static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, -}; - -static int __init sched_clock_syscore_init(void) -{ - register_syscore_ops(&sched_clock_ops); - return 0; -} -device_initcall(sched_clock_syscore_init); diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index abff4e9aaee..98aee325839 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -24,9 +24,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -120,6 +120,4 @@ void __init time_init(void) machine_desc->init_time(); else clocksource_of_init(); - - sched_clock_postinit(); } diff --git a/arch/arm/mach-davinci/time.c b/arch/arm/mach-davinci/time.c index bad361ec166..7a55b5c9597 100644 --- a/arch/arm/mach-davinci/time.c +++ b/arch/arm/mach-davinci/time.c @@ -18,8 +18,8 @@ #include #include #include +#include -#include #include #include diff --git a/arch/arm/mach-imx/time.c b/arch/arm/mach-imx/time.c index fea91313678..cd46529e9ea 100644 --- a/arch/arm/mach-imx/time.c +++ b/arch/arm/mach-imx/time.c @@ -26,8 +26,8 @@ #include #include #include +#include -#include #include #include "common.h" diff --git a/arch/arm/mach-integrator/integrator_ap.c b/arch/arm/mach-integrator/integrator_ap.c index b23c8e4f28e..aa4346227c4 100644 --- a/arch/arm/mach-integrator/integrator_ap.c +++ b/arch/arm/mach-integrator/integrator_ap.c @@ -41,6 +41,7 @@ #include #include #include +#include #include
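A side note on the code being moved: cyc_to_sched_clock()/update_sched_clock() avoid a lock by writing the cycle epoch twice and letting readers retry when the two copies disagree. A condensed sketch of that protocol follows; the struct and function names are invented for illustration, while the field roles, barriers and ordering mirror the code above.

#include <linux/types.h>
#include <asm/barrier.h>

struct epoch {
	u64 ns;		/* nanoseconds at the epoch */
	u32 cyc;	/* counter value at the epoch */
	u32 cyc_copy;	/* written first; readers compare it against cyc */
};

/* Writer: publish a new epoch so readers can detect a racing update. */
static void epoch_publish(struct epoch *e, u32 cyc, u64 ns)
{
	e->cyc_copy = cyc;	/* 1: announce that an update is in progress */
	smp_wmb();
	e->ns = ns;		/* 2: new nanosecond epoch */
	smp_wmb();
	e->cyc = cyc;		/* 3: cyc == cyc_copy again -> consistent */
}

/* Reader: loop until a consistent (cyc, ns) pair is observed. */
static void epoch_snapshot(const struct epoch *e, u32 *cyc, u64 *ns)
{
	do {
		*cyc = e->cyc;
		smp_rmb();
		*ns = e->ns;
		smp_rmb();
	} while (*cyc != e->cyc_copy);
}

The new kernel/time/sched_clock.c carries this scheme over essentially unchanged; the patch is a code move, not a rewrite.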