From 072ba49838b42c873c496d72c91bb237914cf3b6 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Sun, 26 Oct 2008 15:26:57 -0700 Subject: ftrace: fix breakage in bin_fmt results In 777e208d40d0953efc6fb4ab58590da3f7d8f02d we changed from outputting field->cpu (a char) to iter->cpu (unsigned int), increasing the resulting structure size by 3 bytes. Signed-off-by: Eric Anholt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f3b478f917..974973e39e8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1755,7 +1755,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, iter->cpu); + SEQ_PUT_FIELD_RET(s, entry->cpu); SEQ_PUT_FIELD_RET(s, iter->ts); switch (entry->type) { -- cgit v1.2.3-70-g09d2 From 2d3854a37e8b767a51aba38ed6d22817b0631e33 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 5 Nov 2008 13:39:10 +1100 Subject: cpumask: introduce new API, without changing anything Impact: introduce new APIs We want to deprecate cpumasks on the stack, as we are headed for gynormous numbers of CPUs. Eventually, we want to head towards an undefined 'struct cpumask' so they can never be declared on stack. 1) New cpumask functions which take pointers instead of copies. (cpus_* -> cpumask_*) 2) Several new helpers to reduce requirements for temporary cpumasks (cpumask_first_and, cpumask_next_and, cpumask_any_and) 3) Helpers for declaring cpumasks on or offstack for large NR_CPUS (cpumask_var_t, alloc_cpumask_var and free_cpumask_var) 4) 'struct cpumask' for explicitness and to mark new-style code. 5) Make iterator functions stop at nr_cpu_ids (a runtime constant), not NR_CPUS for time efficiency and for smaller dynamic allocations in future. 6) cpumask_copy() so we can allocate less than a full cpumask eventually (for alloc_cpumask_var), and so we can eliminate the 'struct cpumask' definition eventually. 7) work_on_cpu() helper for doing task on a CPU, rather than saving old cpumask for current thread and manipulating it. 8) smp_call_function_many() which is smp_call_function_mask() except taking a cpumask pointer. Note that this patch simply introduces the new functions and leaves the obsolescent ones in place. This is to simplify the transition patches. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- include/linux/cpumask.h | 502 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/smp.h | 9 + include/linux/workqueue.h | 8 + kernel/cpu.c | 3 + kernel/workqueue.c | 45 +++++ lib/cpumask.c | 73 +++++++ 6 files changed, 638 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index d3219d73f8e..c8e66619097 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -5,6 +5,9 @@ * Cpumasks provide a bitmap suitable for representing the * set of CPU's in a system, one bit position per CPU number. * + * The new cpumask_ ops take a "struct cpumask *"; the old ones + * use cpumask_t. + * * See detailed comments in the file linux/bitmap.h describing the * data type on which these cpumasks are based. * @@ -31,7 +34,7 @@ * will span the entire range of NR_CPUS. * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
* - * The available cpumask operations are: + * The obsolescent cpumask operations are: * * void cpu_set(cpu, mask) turn on bit 'cpu' in mask * void cpu_clear(cpu, mask) turn off bit 'cpu' in mask @@ -138,7 +141,7 @@ #include #include -typedef struct { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; +typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; extern cpumask_t _unused_cpumask_arg_; #define cpu_set(cpu, dst) __cpu_set((cpu), &(dst)) @@ -527,4 +530,499 @@ extern cpumask_t cpu_active_map; #define for_each_online_cpu(cpu) for_each_cpu_mask_nr((cpu), cpu_online_map) #define for_each_present_cpu(cpu) for_each_cpu_mask_nr((cpu), cpu_present_map) +/* These are the new versions of the cpumask operators: passed by pointer. + * The older versions will be implemented in terms of these, then deleted. */ +#define cpumask_bits(maskp) ((maskp)->bits) + +#if NR_CPUS <= BITS_PER_LONG +#define CPU_BITS_ALL \ +{ \ + [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ +} + +/* This produces more efficient code. */ +#define nr_cpumask_bits NR_CPUS + +#else /* NR_CPUS > BITS_PER_LONG */ + +#define CPU_BITS_ALL \ +{ \ + [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ + [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ +} + +#define nr_cpumask_bits nr_cpu_ids +#endif /* NR_CPUS > BITS_PER_LONG */ + +/* verify cpu argument to cpumask_* operators */ +static inline unsigned int cpumask_check(unsigned int cpu) +{ +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + WARN_ON_ONCE(cpu >= nr_cpumask_bits); +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ + return cpu; +} + +#if NR_CPUS == 1 +/* Uniprocesor. */ +#define cpumask_first(src) ({ (void)(src); 0; }) +#define cpumask_next(n, src) ({ (void)(src); 1; }) +#define cpumask_next_zero(n, src) ({ (void)(src); 1; }) +#define cpumask_next_and(n, srcp, andp) ({ (void)(srcp), (void)(andp); 1; }) +#define cpumask_any_but(mask, cpu) ({ (void)(mask); (void)(cpu); 0; }) + +#define for_each_cpu(cpu, mask) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) +#define for_each_cpu_and(cpu, mask, and) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and) +#else +/** + * cpumask_first - get the first cpu in a cpumask + * @srcp: the cpumask pointer + * + * Returns >= nr_cpu_ids if no cpus set. + */ +static inline unsigned int cpumask_first(const struct cpumask *srcp) +{ + return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); +} + +/** + * cpumask_next - get the next cpu in a cpumask + * @n: the cpu prior to the place to search (ie. return will be > @n) + * @srcp: the cpumask pointer + * + * Returns >= nr_cpu_ids if no further cpus set. + */ +static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); + return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); +} + +/** + * cpumask_next_zero - get the next unset cpu in a cpumask + * @n: the cpu prior to the place to search (ie. return will be > @n) + * @srcp: the cpumask pointer + * + * Returns >= nr_cpu_ids if no further cpus unset. + */ +static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) +{ + /* -1 is a legal arg here. 
*/ + if (n != -1) + cpumask_check(n); + return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); +} + +int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); +int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); + +#define for_each_cpu(cpu, mask) \ + for ((cpu) = -1; \ + (cpu) = cpumask_next((cpu), (mask)), \ + (cpu) < nr_cpu_ids;) +#define for_each_cpu_and(cpu, mask, and) \ + for ((cpu) = -1; \ + (cpu) = cpumask_next_and((cpu), (mask), (and)), \ + (cpu) < nr_cpu_ids;) +#endif /* SMP */ + +#define CPU_BITS_NONE \ +{ \ + [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ +} + +#define CPU_BITS_CPU0 \ +{ \ + [0] = 1UL \ +} + +/** + * cpumask_set_cpu - set a cpu in a cpumask + * @cpu: cpu number (< nr_cpu_ids) + * @dstp: the cpumask pointer + */ +static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) +{ + set_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + +/** + * cpumask_clear_cpu - clear a cpu in a cpumask + * @cpu: cpu number (< nr_cpu_ids) + * @dstp: the cpumask pointer + */ +static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) +{ + clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + +/** + * cpumask_test_cpu - test for a cpu in a cpumask + * @cpu: cpu number (< nr_cpu_ids) + * @cpumask: the cpumask pointer + * + * No static inline type checking - see Subtlety (1) above. + */ +#define cpumask_test_cpu(cpu, cpumask) \ + test_bit(cpumask_check(cpu), (cpumask)->bits) + +/** + * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask + * @cpu: cpu number (< nr_cpu_ids) + * @cpumask: the cpumask pointer + * + * test_and_set_bit wrapper for cpumasks. + */ +static inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask) +{ + return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask)); +} + +/** + * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask + * @dstp: the cpumask pointer + */ +static inline void cpumask_setall(struct cpumask *dstp) +{ + bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); +} + +/** + * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask + * @dstp: the cpumask pointer + */ +static inline void cpumask_clear(struct cpumask *dstp) +{ + bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits); +} + +/** + * cpumask_and - *dstp = *src1p & *src2p + * @dstp: the cpumask result + * @src1p: the first input + * @src2p: the second input + */ +static inline void cpumask_and(struct cpumask *dstp, + const struct cpumask *src1p, + const struct cpumask *src2p) +{ + bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), + cpumask_bits(src2p), nr_cpumask_bits); +} + +/** + * cpumask_or - *dstp = *src1p | *src2p + * @dstp: the cpumask result + * @src1p: the first input + * @src2p: the second input + */ +static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, + const struct cpumask *src2p) +{ + bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), + cpumask_bits(src2p), nr_cpumask_bits); +} + +/** + * cpumask_xor - *dstp = *src1p ^ *src2p + * @dstp: the cpumask result + * @src1p: the first input + * @src2p: the second input + */ +static inline void cpumask_xor(struct cpumask *dstp, + const struct cpumask *src1p, + const struct cpumask *src2p) +{ + bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p), + cpumask_bits(src2p), nr_cpumask_bits); +} + +/** + * cpumask_andnot - *dstp = *src1p & ~*src2p + * @dstp: the cpumask result + * @src1p: the first input + * @src2p: the second input + */ +static inline void cpumask_andnot(struct cpumask *dstp, + const struct 
cpumask *src1p, + const struct cpumask *src2p) +{ + bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), + cpumask_bits(src2p), nr_cpumask_bits); +} + +/** + * cpumask_complement - *dstp = ~*srcp + * @dstp: the cpumask result + * @srcp: the input to invert + */ +static inline void cpumask_complement(struct cpumask *dstp, + const struct cpumask *srcp) +{ + bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp), + nr_cpumask_bits); +} + +/** + * cpumask_equal - *src1p == *src2p + * @src1p: the first input + * @src2p: the second input + */ +static inline bool cpumask_equal(const struct cpumask *src1p, + const struct cpumask *src2p) +{ + return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), + nr_cpumask_bits); +} + +/** + * cpumask_intersects - (*src1p & *src2p) != 0 + * @src1p: the first input + * @src2p: the second input + */ +static inline bool cpumask_intersects(const struct cpumask *src1p, + const struct cpumask *src2p) +{ + return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), + nr_cpumask_bits); +} + +/** + * cpumask_subset - (*src1p & ~*src2p) == 0 + * @src1p: the first input + * @src2p: the second input + */ +static inline int cpumask_subset(const struct cpumask *src1p, + const struct cpumask *src2p) +{ + return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), + nr_cpumask_bits); +} + +/** + * cpumask_empty - *srcp == 0 + * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. + */ +static inline bool cpumask_empty(const struct cpumask *srcp) +{ + return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits); +} + +/** + * cpumask_full - *srcp == 0xFFFFFFFF... + * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. + */ +static inline bool cpumask_full(const struct cpumask *srcp) +{ + return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); +} + +/** + * cpumask_weight - Count of bits in *srcp + * @srcp: the cpumask to count bits (< nr_cpu_ids) in. + */ +static inline unsigned int cpumask_weight(const struct cpumask *srcp) +{ + return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits); +} + +/** + * cpumask_shift_right - *dstp = *srcp >> n + * @dstp: the cpumask result + * @srcp: the input to shift + * @n: the number of bits to shift by + */ +static inline void cpumask_shift_right(struct cpumask *dstp, + const struct cpumask *srcp, int n) +{ + bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, + nr_cpumask_bits); +} + +/** + * cpumask_shift_left - *dstp = *srcp << n + * @dstp: the cpumask result + * @srcp: the input to shift + * @n: the number of bits to shift by + */ +static inline void cpumask_shift_left(struct cpumask *dstp, + const struct cpumask *srcp, int n) +{ + bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, + nr_cpumask_bits); +} + +/** + * cpumask_copy - *dstp = *srcp + * @dstp: the result + * @srcp: the input cpumask + */ +static inline void cpumask_copy(struct cpumask *dstp, + const struct cpumask *srcp) +{ + bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); +} + +/** + * cpumask_any - pick a "random" cpu from *srcp + * @srcp: the input cpumask + * + * Returns >= nr_cpu_ids if no cpus set. + */ +#define cpumask_any(srcp) cpumask_first(srcp) + +/** + * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 + * @src1p: the first input + * @src2p: the second input + * + * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). 
+ */ +#define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p)) + +/** + * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2 + * @mask1: the first input cpumask + * @mask2: the second input cpumask + * + * Returns >= nr_cpu_ids if no cpus set. + */ +#define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2)) + +/** + * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * + * @bitmap: the bitmap + * + * There are a few places where cpumask_var_t isn't appropriate and + * static cpumasks must be used (eg. very early boot), yet we don't + * expose the definition of 'struct cpumask'. + * + * This does the conversion, and can be used as a constant initializer. + */ +#define to_cpumask(bitmap) \ + ((struct cpumask *)(1 ? (bitmap) \ + : (void *)sizeof(__check_is_bitmap(bitmap)))) + +static inline int __check_is_bitmap(const unsigned long *bitmap) +{ + return 1; +} + +/** + * cpumask_size - size to allocate for a 'struct cpumask' in bytes + * + * This will eventually be a runtime variable, depending on nr_cpu_ids. + */ +static inline size_t cpumask_size(void) +{ + /* FIXME: Once all cpumask assignments are eliminated, this + * can be nr_cpumask_bits */ + return BITS_TO_LONGS(NR_CPUS) * sizeof(long); +} + +/* + * cpumask_var_t: struct cpumask for stack usage. + * + * Oh, the wicked games we play! In order to make kernel coding a + * little more difficult, we typedef cpumask_var_t to an array or a + * pointer: doing &mask on an array is a noop, so it still works. + * + * ie. + * cpumask_var_t tmpmask; + * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + * return -ENOMEM; + * + * ... use 'tmpmask' like a normal struct cpumask * ... + * + * free_cpumask_var(tmpmask); + */ +#ifdef CONFIG_CPUMASK_OFFSTACK +typedef struct cpumask *cpumask_var_t; + +bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); +void alloc_bootmem_cpumask_var(cpumask_var_t *mask); +void free_cpumask_var(cpumask_var_t mask); + +#else +typedef struct cpumask cpumask_var_t[1]; + +static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +{ + return true; +} + +static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) +{ +} + +static inline void free_cpumask_var(cpumask_var_t mask) +{ +} +#endif /* CONFIG_CPUMASK_OFFSTACK */ + +/* The pointer versions of the maps, these will become the primary versions. */ +#define cpu_possible_mask ((const struct cpumask *)&cpu_possible_map) +#define cpu_online_mask ((const struct cpumask *)&cpu_online_map) +#define cpu_present_mask ((const struct cpumask *)&cpu_present_map) +#define cpu_active_mask ((const struct cpumask *)&cpu_active_map) + +/* It's common to want to use cpu_all_mask in struct member initializers, + * so it has to refer to an address rather than a pointer. */ +extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); +#define cpu_all_mask to_cpumask(cpu_all_bits) + +/* First bits of cpu_bit_bitmap are in fact unset. 
*/ +#define cpu_none_mask to_cpumask(cpu_bit_bitmap[0]) + +/* Wrappers for arch boot code to manipulate normally-constant masks */ +static inline void set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) + cpumask_set_cpu(cpu, &cpu_possible_map); + else + cpumask_clear_cpu(cpu, &cpu_possible_map); +} + +static inline void set_cpu_present(unsigned int cpu, bool present) +{ + if (present) + cpumask_set_cpu(cpu, &cpu_present_map); + else + cpumask_clear_cpu(cpu, &cpu_present_map); +} + +static inline void set_cpu_online(unsigned int cpu, bool online) +{ + if (online) + cpumask_set_cpu(cpu, &cpu_online_map); + else + cpumask_clear_cpu(cpu, &cpu_online_map); +} + +static inline void set_cpu_active(unsigned int cpu, bool active) +{ + if (active) + cpumask_set_cpu(cpu, &cpu_active_map); + else + cpumask_clear_cpu(cpu, &cpu_active_map); +} + +static inline void init_cpu_present(const struct cpumask *src) +{ + cpumask_copy(&cpu_present_map, src); +} + +static inline void init_cpu_possible(const struct cpumask *src) +{ + cpumask_copy(&cpu_possible_map, src); +} + +static inline void init_cpu_online(const struct cpumask *src) +{ + cpumask_copy(&cpu_online_map, src); +} #endif /* __LINUX_CPUMASK_H */ diff --git a/include/linux/smp.h b/include/linux/smp.h index 2e4d58b26c0..3f9a60043a9 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -64,8 +64,17 @@ extern void smp_cpus_done(unsigned int max_cpus); * Call a function on all other processors */ int smp_call_function(void(*func)(void *info), void *info, int wait); +/* Deprecated: use smp_call_function_many() which uses a cpumask ptr. */ int smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, int wait); + +static inline void smp_call_function_many(const struct cpumask *mask, + void (*func)(void *info), void *info, + int wait) +{ + smp_call_function_mask(*mask, func, info, wait); +} + int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, int wait); void __smp_call_function_single(int cpuid, struct call_single_data *data); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 89a5a1231ff..b36291130f2 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -240,4 +240,12 @@ void cancel_rearming_delayed_work(struct delayed_work *work) cancel_delayed_work_sync(work); } +#ifndef CONFIG_SMP +static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +{ + return fn(arg); +} +#else +long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg); +#endif /* CONFIG_SMP */ #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 86d49045dae..5a732c5ef08 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -499,3 +499,6 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { #endif }; EXPORT_SYMBOL_GPL(cpu_bit_bitmap); + +const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; +EXPORT_SYMBOL(cpu_all_bits); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f928f2a87b9..d4dc69ddebd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -970,6 +970,51 @@ undo: return ret; } +#ifdef CONFIG_SMP +struct work_for_cpu { + struct work_struct work; + long (*fn)(void *); + void *arg; + long ret; +}; + +static void do_work_for_cpu(struct work_struct *w) +{ + struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work); + + wfc->ret = wfc->fn(wfc->arg); +} + +/** + * work_on_cpu - run a function in user context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the 
function arg + * + * This will return -EINVAL in the cpu is not online, or the return value + * of @fn otherwise. + */ +long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +{ + struct work_for_cpu wfc; + + INIT_WORK(&wfc.work, do_work_for_cpu); + wfc.fn = fn; + wfc.arg = arg; + get_online_cpus(); + if (unlikely(!cpu_online(cpu))) + wfc.ret = -EINVAL; + else { + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); + } + put_online_cpus(); + + return wfc.ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu); +#endif /* CONFIG_SMP */ + void __init init_workqueues(void) { cpu_populated_map = cpu_online_map; diff --git a/lib/cpumask.c b/lib/cpumask.c index 5f97dc25ef9..5ceb4211c83 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -2,6 +2,7 @@ #include #include #include +#include int __first_cpu(const cpumask_t *srcp) { @@ -35,3 +36,75 @@ int __any_online_cpu(const cpumask_t *mask) return cpu; } EXPORT_SYMBOL(__any_online_cpu); + +/** + * cpumask_next_and - get the next cpu in *src1p & *src2p + * @n: the cpu prior to the place to search (ie. return will be > @n) + * @src1p: the first cpumask pointer + * @src2p: the second cpumask pointer + * + * Returns >= nr_cpu_ids if no further cpus set in both. + */ +int cpumask_next_and(int n, const struct cpumask *src1p, + const struct cpumask *src2p) +{ + while ((n = cpumask_next(n, src1p)) < nr_cpu_ids) + if (cpumask_test_cpu(n, src2p)) + break; + return n; +} +EXPORT_SYMBOL(cpumask_next_and); + +/** + * cpumask_any_but - return a "random" in a cpumask, but not this one. + * @mask: the cpumask to search + * @cpu: the cpu to ignore. + * + * Often used to find any cpu but smp_processor_id() in a mask. + * Returns >= nr_cpu_ids if no cpus set. + */ +int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) +{ + unsigned int i; + + for_each_cpu(i, mask) + if (i != cpu) + break; + return i; +} + +/* These are not inline because of header tangles. */ +#ifdef CONFIG_CPUMASK_OFFSTACK +bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +{ + if (likely(slab_is_available())) + *mask = kmalloc(cpumask_size(), flags); + else { +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + printk(KERN_ERR + "=> alloc_cpumask_var: kmalloc not available!\n"); + dump_stack(); +#endif + *mask = NULL; + } +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (!*mask) { + printk(KERN_ERR "=> alloc_cpumask_var: failed!\n"); + dump_stack(); + } +#endif + return *mask != NULL; +} +EXPORT_SYMBOL(alloc_cpumask_var); + +void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) +{ + *mask = alloc_bootmem(cpumask_size()); +} + +void free_cpumask_var(cpumask_var_t mask) +{ + kfree(mask); +} +EXPORT_SYMBOL(free_cpumask_var); +#endif -- cgit v1.2.3-70-g09d2 From 5ac5c4d604bf894ef672a7971d03fefdc7ea7e49 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Nov 2008 10:46:32 +0100 Subject: sched: clean up debug info Impact: clean up and fix debug info printout While looking over the sched_debug code I noticed that we printed the rq schedstats for every cfs_rq, ammend this. Also change nr_spead_over into an int, and fix a little buglet in min_vruntime printing. 
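As a usage illustration of the pointer-based cpumask API and the work_on_cpu() helper introduced by the cpumask patch above ("cpumask: introduce new API, without changing anything"), here is a minimal, hypothetical sketch; the function names and the printk are illustrative only and not part of any patch in this series:

#include <linux/cpumask.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/gfp.h>

static long count_cpus_on_cpu(void *arg)
{
	/* runs via work_on_cpu() on the CPU chosen below */
	return cpumask_weight((const struct cpumask *)arg);
}

static long example_new_cpumask_api(void)
{
	cpumask_var_t tmpmask;
	unsigned int cpu;
	long ret;

	/* on-stack array unless CONFIG_CPUMASK_OFFSTACK, then kmalloc'ed */
	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_and(tmpmask, cpu_online_mask, cpu_present_mask);

	/* the new iterators stop at nr_cpu_ids, not NR_CPUS */
	for_each_cpu(cpu, tmpmask)
		printk(KERN_DEBUG "cpu %u is online and present\n", cpu);

	/* run a function on one of those CPUs without touching ->cpus_allowed */
	cpu = cpumask_any(tmpmask);
	ret = work_on_cpu(cpu, count_cpus_on_cpu, tmpmask);

	free_cpumask_var(tmpmask);
	return ret;
}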
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_debug.c | 41 +++++++++++++++++++++-------------------- 2 files changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 57c933ffbee..f3149244e32 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -399,7 +399,7 @@ struct cfs_rq { */ struct sched_entity *curr, *next, *last; - unsigned long nr_spread_over; + unsigned int nr_spread_over; #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ae17762ec3..48ecc51e770 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -144,7 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; - min_vruntime = rq->cfs.min_vruntime; + min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", @@ -161,26 +161,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); - - P(yld_exp_empty); - P(yld_act_empty); - P(yld_both_empty); - P(yld_count); - P(sched_switch); - P(sched_count); - P(sched_goidle); - - P(ttwu_count); - P(ttwu_local); - - P(bkl_count); - -#undef P -#endif - SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", + SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP @@ -260,6 +242,25 @@ static void print_cpu(struct seq_file *m, int cpu) #undef P #undef PN +#ifdef CONFIG_SCHEDSTATS +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); + + P(yld_exp_empty); + P(yld_act_empty); + P(yld_both_empty); + P(yld_count); + + P(sched_switch); + P(sched_count); + P(sched_goidle); + + P(ttwu_count); + P(ttwu_local); + + P(bkl_count); + +#undef P +#endif print_cfs_stats(m, cpu); print_rt_stats(m, cpu); -- cgit v1.2.3-70-g09d2 From ee5f80a993539490a07477ff2526bf62c503fbb4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Nov 2008 11:06:00 +0100 Subject: irq: call __irq_enter() before calling the tick_idle_check Impact: avoid spurious ksoftirqd wakeups The tick idle check which is called from irq_enter() was run before the call to __irq_enter() which did not set the in_interrupt() bits in preempt_count. That way the raise of a softirq woke up softirqd for nothing as the softirq was handled on return from interrupt. Call __irq_enter() before calling into the tick idle check code. 
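The spurious wakeup the changelog refers to comes from the in_interrupt() test in raise_softirq_irqoff(); roughly, simplified from kernel/softirq.c of this era (a sketch, not an exact copy):

inline void raise_softirq_irqoff(unsigned int nr)
{
	__raise_softirq_irqoff(nr);	/* mark the softirq pending */

	/*
	 * With __irq_enter() already done, in_interrupt() is true and the
	 * pending softirq is simply run on return from the interrupt.
	 * Before this fix, tick_check_idle() could raise a softirq while
	 * in_interrupt() was still 0, so ksoftirqd was woken for nothing.
	 */
	if (!in_interrupt())
		wakeup_softirqd();
}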
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/softirq.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 7110daeb9a9..e7c69a720d6 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -269,10 +269,11 @@ void irq_enter(void) { int cpu = smp_processor_id(); - if (idle_cpu(cpu) && !in_interrupt()) + if (idle_cpu(cpu) && !in_interrupt()) { + __irq_enter(); tick_check_idle(cpu); - - __irq_enter(); + } else + __irq_enter(); } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED -- cgit v1.2.3-70-g09d2 From ae99286b4f1be7788f2d6947c66a91dbd6351eec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Nov 2008 13:20:23 +0100 Subject: nohz: disable tick_nohz_kick_tick() for now Impact: nohz powersavings and wakeup regression commit fb02fbc14d17837b4b7b02dbb36142c16a7bf208 (NOHZ: restart tick device from irq_enter()) causes a serious wakeup regression. While the patch is correct it does not take into account that spurious wakeups happen on x86. A fix for this issue is available, but we just revert to the .27 behaviour and let long running softirqs screw themself. Disable it for now. Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5bbb1044f84..342fc9ccab4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -568,6 +568,9 @@ static void tick_nohz_switch_to_nohz(void) */ static void tick_nohz_kick_tick(int cpu) { +#if 0 + /* Switch back to 2.6.27 behaviour */ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t delta, now; @@ -584,6 +587,7 @@ static void tick_nohz_kick_tick(int cpu) return; tick_nohz_restart(ts, now); +#endif } #else -- cgit v1.2.3-70-g09d2 From bf5e6519b85b3853f2d0bb4f17a4e2eaeffeb574 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Nov 2008 21:46:00 -0500 Subject: ftrace: disable tracing on resize Impact: fix for bug on resize This patch addresses the bug found here: http://bugzilla.kernel.org/show_bug.cgi?id=11996 When ftrace converted to the new unified trace buffer, the resizing of the buffer was not protected as much as it was originally. If tracing is performed while the resize occurs, then the buffer can be corrupted. This patch disables all ftrace buffer modifications before a resize takes place. 
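For context on why bumping the per-cpu ->disabled counters is sufficient: the tracing write paths check that counter before touching the ring buffer. A simplified, hypothetical sketch of that pattern (not code from this patch):

static void example_trace_write(struct trace_array *tr)
{
	struct trace_array_cpu *data;
	long disabled;
	int cpu;

	preempt_disable_notrace();
	cpu = raw_smp_processor_id();
	data = tr->data[cpu];

	/* non-zero while tracing_entries_write() is resizing the buffers */
	disabled = atomic_inc_return(&data->disabled);
	if (likely(disabled == 1)) {
		/* ... reserve, fill and commit a ring buffer entry ... */
	}

	atomic_dec(&data->disabled);
	preempt_enable_notrace();
}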
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f3b478f917..abfa8103d04 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2676,7 +2676,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; - int ret; + int ret, cpu; struct trace_array *tr = filp->private_data; if (cnt >= sizeof(buf)) @@ -2704,6 +2704,14 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, goto out; } + /* disable all cpu buffers */ + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_inc(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_inc(&max_tr.data[cpu]->disabled); + } + if (val != global_trace.entries) { ret = ring_buffer_resize(global_trace.buffer, val); if (ret < 0) { @@ -2735,6 +2743,13 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) cnt = -ENOMEM; out: + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_dec(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_dec(&max_tr.data[cpu]->disabled); + } + max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); -- cgit v1.2.3-70-g09d2 From 4143c5cb36331155a1823af8b3a8c761a59fed71 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Nov 2008 21:46:01 -0500 Subject: ring-buffer: prevent infinite looping on time stamping Impact: removal of unnecessary looping The lockless part of the ring buffer allows for reentry into the code from interrupts. A timestamp is taken, a test is preformed and if it detects that an interrupt occurred that did tracing, it tries again. The problem arises if the timestamp code itself causes a trace. The detection will detect this and loop again. The difference between this and an interrupt doing tracing, is that this will fail every time, and cause an infinite loop. Currently, we test if the loop happens 1000 times, and if so, it will produce a warning and disable the ring buffer. The problem with this approach is that it makes it difficult to perform some types of tracing (tracing the timestamp code itself). Each trace entry has a delta timestamp from the previous entry. If a trace entry is reserved but and interrupt occurs and traces before the previous entry is commited, the delta timestamp for that entry will be zero. This actually makes sense in terms of tracing, because the interrupt entry happened before the preempted entry was commited, so one may consider the two happening at the same time. The order is still preserved in the buffer. With this idea, instead of trying to get a new timestamp if an interrupt made it in between the timestamp and the test, the entry could simply make the delta zero and continue. This will prevent interrupts or tracers in the timer code from causing the above loop. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3f338063864..2f76193c348 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1060,7 +1060,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, /* Did the write stamp get updated already? 
*/ if (unlikely(ts < cpu_buffer->write_stamp)) - goto again; + delta = 0; if (test_time_stamp(delta)) { -- cgit v1.2.3-70-g09d2 From ad474caca3e2a0550b7ce0706527ad5ab389a4d4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 10 Nov 2008 15:39:30 +0100 Subject: fix for account_group_exec_runtime(), make sure ->signal can't be freed under rq->lock Impact: fix hang/crash on ia64 under high load This is ugly, but the simplest patch by far. Unlike other similar routines, account_group_exec_runtime() could be called "implicitly" from within scheduler after exit_notify(). This means we can race with the parent doing release_task(), we can't just check ->signal != NULL. Change __exit_signal() to do spin_unlock_wait(&task_rq(tsk)->lock) before __cleanup_signal() to make sure ->signal can't be freed under task_rq(tsk)->lock. Note that task_rq_unlock_wait() doesn't care about the case when tsk changes cpu/rq under us, this should be OK. Thanks to Ingo who nacked my previous buggy patch. Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar Reported-by: Doug Chapman --- include/linux/sched.h | 1 + kernel/exit.c | 5 +++++ kernel/sched.c | 8 ++++++++ 3 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 295b7c756ca..644ffbda17c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -247,6 +247,7 @@ extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); extern int runqueue_is_locked(void); +extern void task_rq_unlock_wait(struct task_struct *p); extern cpumask_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) diff --git a/kernel/exit.c b/kernel/exit.c index 80137a5d946..ae2b92be5fa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -141,6 +141,11 @@ static void __exit_signal(struct task_struct *tsk) if (sig) { flush_sigqueue(&sig->shared_pending); taskstats_tgid_free(sig); + /* + * Make sure ->signal can't go away under rq->lock, + * see account_group_exec_runtime(). + */ + task_rq_unlock_wait(tsk); __cleanup_signal(sig); } } diff --git a/kernel/sched.c b/kernel/sched.c index f3149244e32..50a21f96467 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -969,6 +969,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } +void task_rq_unlock_wait(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + + smp_mb(); /* spin-unlock-wait is not a full memory barrier */ + spin_unlock_wait(&rq->lock); +} + static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { -- cgit v1.2.3-70-g09d2 From 5d5254f0d3b9bebc47d97e357374c0ad0c291a7d Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Sat, 25 Oct 2008 10:22:38 +0530 Subject: timers: handle HRTIMER_CB_IRQSAFE_UNLOCKED correctly from softirq context Impact: fix incorrect locking triggered during hotplug-intense stress-tests While migrating the the CB_IRQSAFE_UNLOCKED timers during a cpu-offline, we queue them on the cb_pending list, so that they won't go stale. Thus, when the callbacks of the timers run from the softirq context, they could run into potential deadlocks, since these callbacks assume that they're running with irq's disabled, thereby annoying lockdep! Fix this by emulating hardirq context while running these callbacks from the hrtimer softirq. ================================= [ INFO: inconsistent lock state ] 2.6.27 #2 -------------------------------- inconsistent {in-hardirq-W} -> {hardirq-on-W} usage. 
ksoftirqd/0/4 [HC0[0]:SC1[1]:HE1:SE0] takes: (&rq->lock){++..}, at: [] sched_rt_period_timer+0x9e/0x1fc {in-hardirq-W} state was registered at: [] __lock_acquire+0x549/0x121e [] native_sched_clock+0x88/0x99 [] clocksource_get_next+0x39/0x3f [] update_wall_time+0x616/0x7df [] lock_acquire+0x5a/0x74 [] scheduler_tick+0x3a/0x18d [] _spin_lock+0x1c/0x45 [] scheduler_tick+0x3a/0x18d [] scheduler_tick+0x3a/0x18d [] update_process_times+0x3a/0x44 [] tick_periodic+0x63/0x6d [] tick_handle_periodic+0x14/0x5e [] timer_interrupt+0x44/0x4a [] handle_IRQ_event+0x13/0x3d [] handle_level_irq+0x79/0xbd [] do_IRQ+0x69/0x7d [] common_interrupt+0x28/0x30 [] aac_probe_one+0x1a3/0x3f3 [] _spin_unlock_irqrestore+0x36/0x39 [] setup_irq+0x1be/0x1f9 [] start_kernel+0x259/0x2c5 [] 0xffffffff irq event stamp: 50102 hardirqs last enabled at (50102): [] _spin_unlock_irq+0x20/0x23 hardirqs last disabled at (50101): [] _spin_lock_irq+0xa/0x4b softirqs last enabled at (50088): [] do_softirq+0x37/0x4d softirqs last disabled at (50099): [] do_softirq+0x37/0x4d other info that might help us debug this: no locks held by ksoftirqd/0/4. stack backtrace: Pid: 4, comm: ksoftirqd/0 Not tainted 2.6.27 #2 [] print_usage_bug+0x13e/0x147 [] mark_lock+0x493/0x797 [] __lock_acquire+0x5be/0x121e [] lock_acquire+0x5a/0x74 [] sched_rt_period_timer+0x9e/0x1fc [] _spin_lock+0x1c/0x45 [] sched_rt_period_timer+0x9e/0x1fc [] sched_rt_period_timer+0x9e/0x1fc [] finish_task_switch+0x41/0xbd [] native_sched_clock+0x88/0x99 [] sched_rt_period_timer+0x0/0x1fc [] run_hrtimer_pending+0x54/0xe5 [] sched_rt_period_timer+0x0/0x1fc [] __do_softirq+0x7b/0xef [] do_softirq+0x37/0x4d [] ksoftirqd+0x56/0xc5 [] ksoftirqd+0x0/0xc5 [] kthread+0x38/0x5d [] kthread+0x0/0x5d [] kernel_thread_helper+0x7/0x10 ======================= Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Acked-by: "Paul E. McKenney" Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 2b465dfde42..95d3949f2ae 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1209,6 +1209,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) enum hrtimer_restart (*fn)(struct hrtimer *); struct hrtimer *timer; int restart; + int emulate_hardirq_ctx = 0; timer = list_entry(cpu_base->cb_pending.next, struct hrtimer, cb_entry); @@ -1217,10 +1218,24 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) timer_stats_account_hrtimer(timer); fn = timer->function; + /* + * A timer might have been added to the cb_pending list + * when it was migrated during a cpu-offline operation. + * Emulate hardirq context for such timers. + */ + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || + timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) + emulate_hardirq_ctx = 1; + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); spin_unlock_irq(&cpu_base->lock); - restart = fn(timer); + if (unlikely(emulate_hardirq_ctx)) { + local_irq_disable(); + restart = fn(timer); + local_irq_enable(); + } else + restart = fn(timer); spin_lock_irq(&cpu_base->lock); -- cgit v1.2.3-70-g09d2 From 2002c69595a092518107f7e3c1294c9710bc92ae Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 Nov 2008 11:52:33 +0100 Subject: sched: release buddies on yield Clear buddies on yield, so that the buddy rules don't schedule them despite them being placed right-most. This fixed a performance regression with yield-happy binary JVMs. 
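The buddy preference that makes this necessary lives in pick_next_entity(): the next/last buddies are preferred over strict vruntime order, so a task that yielded itself right-most would still be chosen while it remains a buddy. A paraphrased sketch (not verbatim from this tree):

static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = __pick_next_entity(cfs_rq);	/* left-most */

	/* buddies win as long as they are within the wakeup-preemption bound */
	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
		return cfs_rq->next;

	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
		return cfs_rq->last;

	return se;
}

Clearing the buddies in yield_task_fair() (see the diff below) removes that preference for the yielding task.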
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Tested-by: Lin Ming --- kernel/sched_fair.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 51aa3e102ac..98345e45b05 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -716,6 +716,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) __enqueue_entity(cfs_rq, se); } +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (cfs_rq->last == se) + cfs_rq->last = NULL; + + if (cfs_rq->next == se) + cfs_rq->next = NULL; +} + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -738,11 +747,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) #endif } - if (cfs_rq->last == se) - cfs_rq->last = NULL; - - if (cfs_rq->next == se) - cfs_rq->next = NULL; + clear_buddies(cfs_rq, se); if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); @@ -977,6 +982,8 @@ static void yield_task_fair(struct rq *rq) if (unlikely(cfs_rq->nr_running == 1)) return; + clear_buddies(cfs_rq, se); + if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { update_rq_clock(rq); /* -- cgit v1.2.3-70-g09d2 From 621a0d5207c18012cb39932f2d9830a11a6cb03d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Nov 2008 09:36:35 +0100 Subject: hrtimer: clean up unused callback modes Impact: cleanup git grep HRTIMER_CB_IRQSAFE revealed half the callback modes are actually unused. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/hrtimer.h | 5 ----- kernel/hrtimer.c | 9 --------- 2 files changed, 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 07e510a3b00..3eba43878dc 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -46,9 +46,6 @@ enum hrtimer_restart { * hrtimer callback modes: * * HRTIMER_CB_SOFTIRQ: Callback must run in softirq context - * HRTIMER_CB_IRQSAFE: Callback may run in hardirq context - * HRTIMER_CB_IRQSAFE_NO_RESTART: Callback may run in hardirq context and - * does not restart the timer * HRTIMER_CB_IRQSAFE_PERCPU: Callback must run in hardirq context * Special mode for tick emulation and * scheduler timer. Such timers are per @@ -61,8 +58,6 @@ enum hrtimer_restart { */ enum hrtimer_cb_mode { HRTIMER_CB_SOFTIRQ, - HRTIMER_CB_IRQSAFE, - HRTIMER_CB_IRQSAFE_NO_RESTART, HRTIMER_CB_IRQSAFE_PERCPU, HRTIMER_CB_IRQSAFE_UNLOCKED, }; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 95d3949f2ae..47e63349d1b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -664,14 +664,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, /* Timer is expired, act upon the callback mode */ switch(timer->cb_mode) { - case HRTIMER_CB_IRQSAFE_NO_RESTART: - debug_hrtimer_deactivate(timer); - /* - * We can call the callback from here. No restart - * happens, so no danger of recursion - */ - BUG_ON(timer->function(timer) != HRTIMER_NORESTART); - return 1; case HRTIMER_CB_IRQSAFE_PERCPU: case HRTIMER_CB_IRQSAFE_UNLOCKED: /* @@ -683,7 +675,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, */ debug_hrtimer_deactivate(timer); return 1; - case HRTIMER_CB_IRQSAFE: case HRTIMER_CB_SOFTIRQ: /* * Move everything else into the softirq pending list ! 
-- cgit v1.2.3-70-g09d2 From a2d477778e82a60a0b7114cefdb70aa43af28782 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 12 Nov 2008 16:19:00 +0530 Subject: sched: fix stale value in average load per task Impact: fix load balancer load average calculation accuracy cpu_avg_load_per_task() returns a stale value when nr_running is 0. It returns an older stale (caculated when nr_running was non zero) value. This patch returns and sets rq->avg_load_per_task to zero when nr_running is 0. Compile and boot tested on a x86_64 box. Signed-off-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 50a21f96467..3bafbe350f4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1456,6 +1456,8 @@ static unsigned long cpu_avg_load_per_task(int cpu) if (rq->nr_running) rq->avg_load_per_task = rq->load.weight / rq->nr_running; + else + rq->avg_load_per_task = 0; return rq->avg_load_per_task; } -- cgit v1.2.3-70-g09d2 From 5cbd54ef470d880fc37fbe4b21eb514806d51e0d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 Nov 2008 20:05:50 +0100 Subject: sched: fix init_idle()'s use of sched_clock() Maciej Rutecki reported: > I have this bug during suspend to disk: > > [ 188.592151] Enabling non-boot CPUs ... > [ 188.592151] SMP alternatives: switching to SMP code > [ 188.666058] BUG: using smp_processor_id() in preemptible > [00000000] > code: suspend_to_disk/2934 > [ 188.666064] caller is native_sched_clock+0x2b/0x80 Which, as noted by Linus, was caused by me, via: 7cbaef9c "sched: optimize sched_clock() a bit" Move the rq locking a bit earlier in the initialization sequence, that will make the sched_clock() call in init_idle() non-preemptible. Reported-by: Maciej Rutecki Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3bafbe350f4..c94baf2969e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5870,6 +5870,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; + spin_lock_irqsave(&rq->lock, flags); + __sched_fork(idle); idle->se.exec_start = sched_clock(); @@ -5877,7 +5879,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->cpus_allowed = cpumask_of_cpu(cpu); __set_task_cpu(idle, cpu); - spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) idle->oncpu = 1; -- cgit v1.2.3-70-g09d2 From 687446760bd008df96655cb8c5900f8e48a7118c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 12 Nov 2008 13:26:49 -0800 Subject: freezer_cg: remove task_lock from freezer_fork() In theory the task can be moved to another cgroup and the freezer will be freed right after task_lock is dropped, so the lock results in zero protection. But in the case of freezer_fork() no lock is needed, since the task is not in tasklist yet so it won't be moved to another cgroup, so task->cgroups won't be changed or invalidated. Signed-off-by: Li Zefan Cc: Matt Helsley Cc: Cedric Le Goater Cc: "Serge E. 
Hallyn" Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 7fa476f01d0..66059071040 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -184,9 +184,13 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) { struct freezer *freezer; - task_lock(task); + /* + * No lock is needed, since the task isn't on tasklist yet, + * so it can't be moved to another cgroup, which means the + * freezer won't be removed and will be valid during this + * function call. + */ freezer = task_freezer(task); - task_unlock(task); spin_lock_irq(&freezer->lock); BUG_ON(freezer->state == CGROUP_FROZEN); -- cgit v1.2.3-70-g09d2 From 3b1b3f6e57064aa8f91c290fe51cda4c74642902 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 12 Nov 2008 13:26:50 -0800 Subject: freezer_cg: disable writing freezer.state of root cgroup With this change, control file 'freezer.state' doesn't exist in root cgroup, making root cgroup unfreezable. I think it's reasonable to disallow freeze tasks in the root cgroup. And then we can avoid fork overhead when freezer subsystem is compiled but not used. Also make writing invalid value to freezer.state returns EINVAL rather than EIO. This is more consistent with other cgroup subsystem. Signed-off-by: Li Zefan Acked-by: Paul Menage Cc: Cedric Le Goater Cc: Paul Menage Cc: Matt Helsley Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/freezer-subsystem.txt | 21 ++++++++++++--------- kernel/cgroup_freezer.c | 11 ++++++++++- 2 files changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt index c50ab58b72e..41f37fea127 100644 --- a/Documentation/cgroups/freezer-subsystem.txt +++ b/Documentation/cgroups/freezer-subsystem.txt @@ -1,4 +1,4 @@ - The cgroup freezer is useful to batch job management system which start +The cgroup freezer is useful to batch job management system which start and stop sets of tasks in order to schedule the resources of a machine according to the desires of a system administrator. This sort of program is often used on HPC clusters to schedule access to the cluster as a @@ -6,7 +6,7 @@ whole. The cgroup freezer uses cgroups to describe the set of tasks to be started/stopped by the batch job management system. It also provides a means to start and stop the tasks composing the job. - The cgroup freezer will also be useful for checkpointing running groups +The cgroup freezer will also be useful for checkpointing running groups of tasks. The freezer allows the checkpoint code to obtain a consistent image of the tasks by attempting to force the tasks in a cgroup into a quiescent state. Once the tasks are quiescent another task can @@ -16,7 +16,7 @@ recoverable error occur. This also allows the checkpointed tasks to be migrated between nodes in a cluster by copying the gathered information to another node and restarting the tasks there. - Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping +Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping and resuming tasks in userspace. Both of these signals are observable from within the tasks we wish to freeze. 
While SIGSTOP cannot be caught, blocked, or ignored it can be seen by waiting or ptracing parent tasks. @@ -37,26 +37,29 @@ demonstrate this problem using nested bash shells: - This happens because bash can observe both signals and choose how it +This happens because bash can observe both signals and choose how it responds to them. - Another example of a program which catches and responds to these +Another example of a program which catches and responds to these signals is gdb. In fact any program designed to use ptrace is likely to have a problem with this method of stopping and resuming tasks. - In contrast, the cgroup freezer uses the kernel freezer code to +In contrast, the cgroup freezer uses the kernel freezer code to prevent the freeze/unfreeze cycle from becoming visible to the tasks being frozen. This allows the bash example above and gdb to run as expected. - The freezer subsystem in the container filesystem defines a file named +The freezer subsystem in the container filesystem defines a file named freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup. Reading will return the current state. +Note freezer.state doesn't exist in root cgroup, which means root cgroup +is non-freezable. + * Examples of usage : - # mkdir /containers/freezer + # mkdir /containers # mount -t cgroup -ofreezer freezer /containers # mkdir /containers/0 # echo $some_pid > /containers/0/tasks @@ -94,6 +97,6 @@ things happens: the freezer.state file 2) Userspace retries the freezing operation by writing "FROZEN" to the freezer.state file (writing "FREEZING" is not legal - and returns EIO) + and returns EINVAL) 3) The tasks that blocked the cgroup from entering the "FROZEN" state disappear from the cgroup's set of tasks. diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 66059071040..fb249e2bcad 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -192,6 +192,13 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) */ freezer = task_freezer(task); + /* + * The root cgroup is non-freezable, so we can skip the + * following check. + */ + if (!freezer->css.cgroup->parent) + return; + spin_lock_irq(&freezer->lock); BUG_ON(freezer->state == CGROUP_FROZEN); @@ -335,7 +342,7 @@ static int freezer_write(struct cgroup *cgroup, else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) goal_state = CGROUP_FROZEN; else - return -EIO; + return -EINVAL; if (!cgroup_lock_live_group(cgroup)) return -ENODEV; @@ -354,6 +361,8 @@ static struct cftype files[] = { static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) { + if (!cgroup->parent) + return 0; return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); } -- cgit v1.2.3-70-g09d2 From a189d0350f387786b1fb5a5d19e3a5ab0bc0cceb Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 12 Nov 2008 13:26:51 -0800 Subject: kprobes: disable preempt for module_text_address() and kernel_text_address() __register_kprobe() can be preempted after checking probing address but before module_text_address() or try_module_get(), and in this interval the module can be unloaded. In that case, try_module_get(probed_mod) will access to invalid address, or kprobe will probe invalid address. This patch uses preempt_disable() to protect it and uses __module_text_address() and __kernel_text_address(). 
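The general pattern the fix applies, written as a hypothetical helper (the function name is made up; the point is the locking rule: __module_text_address() must run with preemption disabled so the module cannot be freed between the lookup and try_module_get()):

static struct module *get_module_for_addr(unsigned long addr)
{
	struct module *mod = NULL;

	preempt_disable();
	if (__kernel_text_address(addr)) {
		mod = __module_text_address(addr);
		if (mod && !try_module_get(mod))
			mod = NULL;
	}
	preempt_enable();

	return mod;	/* caller drops the reference with module_put() */
}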
Signed-off-by: Lai Jiangshan Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Hiroshi Shimamoto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 8b57a2597f2..f83c5e42fb0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -613,30 +613,37 @@ static int __kprobes __register_kprobe(struct kprobe *p, return -EINVAL; p->addr = addr; - if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr)) + preempt_disable(); + if (!__kernel_text_address((unsigned long) p->addr) || + in_kprobes_functions((unsigned long) p->addr)) { + preempt_enable(); return -EINVAL; + } p->mod_refcounted = 0; /* * Check if are we probing a module. */ - probed_mod = module_text_address((unsigned long) p->addr); + probed_mod = __module_text_address((unsigned long) p->addr); if (probed_mod) { - struct module *calling_mod = module_text_address(called_from); + struct module *calling_mod; + calling_mod = __module_text_address(called_from); /* * We must allow modules to probe themself and in this case * avoid incrementing the module refcount, so as to allow * unloading of self probing modules. */ if (calling_mod && calling_mod != probed_mod) { - if (unlikely(!try_module_get(probed_mod))) + if (unlikely(!try_module_get(probed_mod))) { + preempt_enable(); return -EINVAL; + } p->mod_refcounted = 1; } else probed_mod = NULL; } + preempt_enable(); p->nmissed = 0; INIT_LIST_HEAD(&p->list); @@ -718,6 +725,10 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) struct kprobe *old_p; if (p->mod_refcounted) { + /* + * Since we've already incremented refcount, + * we don't need to disable preemption. + */ mod = module_text_address((unsigned long)p->addr); if (mod) module_put(mod); -- cgit v1.2.3-70-g09d2 From 7e036d040a28bf95255d7eb9faf0ffbba3677e99 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 12 Nov 2008 13:26:57 -0800 Subject: kernel/kprobes.c: don't pad kretprobe_table_locks[] on uniprocessor builds We only need the cacheline padding on SMP kernels. Saves 6k: text data bss dec hex filename 5713 388 8840 14941 3a5d kernel/kprobes.o 5713 388 2632 8733 221d kernel/kprobes.o Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f83c5e42fb0..9f8a3f25259 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -72,7 +72,7 @@ static bool kprobe_enabled; DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { - spinlock_t lock ____cacheline_aligned; + spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) -- cgit v1.2.3-70-g09d2 From 8141c7f3e7aee618312fa1c15109e1219de784a7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Nov 2008 10:20:36 -0800 Subject: Move "exit_robust_list" into mm_release() We don't want to get rid of the futexes just at exit() time, we want to drop them when doing an execve() too, since that gets rid of the previous VM image too. Doing it at mm_release() time means that we automatically always do it when we disassociate a VM map from the task. 
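Both paths that detach a task from its VM image go through mm_release(), which is why the cleanup now also covers execve(). Paraphrased call sites of this kernel series (a sketch, not verbatim):

/* kernel/exit.c */
static void exit_mm(struct task_struct *tsk)
{
	struct mm_struct *mm = tsk->mm;

	mm_release(tsk, mm);	/* futex robust lists dropped here on exit */
	/* ... mmput(mm) etc. ... */
}

/* fs/exec.c */
static int exec_mmap(struct mm_struct *mm)
{
	struct mm_struct *old_mm = current->mm;

	/* Notify parent that we're no longer interested in the old VM */
	mm_release(current, old_mm);	/* and now the old image's futexes too */
	/* ... install the new mm, mmput(old_mm) ... */
	return 0;
}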
Reported-by: pageexec@freemail.hu Cc: Andrew Morton Cc: Nick Piggin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Brad Spengler Cc: Alex Efros Cc: Peter Zijlstra Cc: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 9 --------- kernel/fork.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ae2b92be5fa..2d8be7ebb0f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include /* for audit_free() */ #include @@ -1059,14 +1058,6 @@ NORET_TYPE void do_exit(long code) exit_itimers(tsk->signal); } acct_collect(code, group_dead); -#ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) - exit_robust_list(tsk); -#ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) - compat_exit_robust_list(tsk); -#endif -#endif if (group_dead) tty_audit_exit(); if (unlikely(tsk->audit_context)) diff --git a/kernel/fork.c b/kernel/fork.c index f6083561dfe..2a372a0e206 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -519,6 +520,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) { struct completion *vfork_done = tsk->vfork_done; + /* Get rid of any futexes when releasing the mm */ +#ifdef CONFIG_FUTEX + if (unlikely(tsk->robust_list)) + exit_robust_list(tsk); +#ifdef CONFIG_COMPAT + if (unlikely(tsk->compat_robust_list)) + compat_exit_robust_list(tsk); +#endif +#endif + /* Get rid of any cached register state */ deactivate_mm(tsk, mm); -- cgit v1.2.3-70-g09d2 From 8f7b0ba1c853919b85b54774775f567f30006107 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Nov 2008 01:15:43 +0000 Subject: Fix inotify watch removal/umount races Inotify watch removals suck violently. To kick the watch out we need (in this order) inode->inotify_mutex and ih->mutex. That's fine if we have a hold on inode; however, for all other cases we need to make damn sure we don't race with umount. We can *NOT* just grab a reference to a watch - inotify_unmount_inodes() will happily sail past it and we'll end with reference to inode potentially outliving its superblock. Ideally we just want to grab an active reference to superblock if we can; that will make sure we won't go into inotify_umount_inodes() until we are done. Cleanup is just deactivate_super(). However, that leaves a messy case - what if we *are* racing with umount() and active references to superblock can't be acquired anymore? We can bump ->s_count, grab ->s_umount, which will almost certainly wait until the superblock is shut down and the watch in question is pining for fjords. That's fine, but there is a problem - we might have hit the window between ->s_active getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock is past the point of no return and is heading for shutdown) and the moment when deactivate_super() acquires ->s_umount. We could just do drop_super() yield() and retry, but that's rather antisocial and this stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having found that we'd got there first (i.e. that ->s_root is non-NULL) we know that we won't race with inotify_umount_inodes(). So we could grab a reference to watch and do the rest as above, just with drop_super() instead of deactivate_super(), right? Wrong. We had to drop ih->mutex before we could grab ->s_umount. So the watch could've been gone already. 
That still can be dealt with - we need to save watch->wd, do idr_find() and compare its result with our pointer. If they match, we either have the damn thing still alive or we'd lost not one but two races at once, the watch had been killed and a new one got created with the same ->wd at the same address. That couldn't have happened in inotify_destroy(), but inotify_rm_wd() could run into that. Still, "new one got created" is not a problem - we have every right to kill it or leave it alone, whatever's more convenient. So we can use idr_find(...) == watch && watch->inode->i_sb == sb as "grab it and kill it" check. If it's been our original watch, we are fine, if it's a newcomer - nevermind, just pretend that we'd won the race and kill the fscker anyway; we are safe since we know that its superblock won't be going away. And yes, this is far beyond mere "not very pretty"; so's the entire concept of inotify to start with. Signed-off-by: Al Viro Acked-by: Greg KH Signed-off-by: Linus Torvalds --- fs/inotify.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/inotify.h | 11 ++++ kernel/audit_tree.c | 91 +++++++++++++++++------------ kernel/auditfilter.c | 14 +++-- 4 files changed, 218 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/fs/inotify.c b/fs/inotify.c index 690e72595e6..7bbed1b8982 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -106,6 +106,20 @@ void get_inotify_watch(struct inotify_watch *watch) } EXPORT_SYMBOL_GPL(get_inotify_watch); +int pin_inotify_watch(struct inotify_watch *watch) +{ + struct super_block *sb = watch->inode->i_sb; + spin_lock(&sb_lock); + if (sb->s_count >= S_BIAS) { + atomic_inc(&sb->s_active); + spin_unlock(&sb_lock); + atomic_inc(&watch->count); + return 1; + } + spin_unlock(&sb_lock); + return 0; +} + /** * put_inotify_watch - decrements the ref count on a given watch. cleans up * watch references if the count reaches zero. inotify_watch is freed by @@ -124,6 +138,13 @@ void put_inotify_watch(struct inotify_watch *watch) } EXPORT_SYMBOL_GPL(put_inotify_watch); +void unpin_inotify_watch(struct inotify_watch *watch) +{ + struct super_block *sb = watch->inode->i_sb; + put_inotify_watch(watch); + deactivate_super(sb); +} + /* * inotify_handle_get_wd - returns the next WD for use by the given handle * @@ -479,6 +500,112 @@ void inotify_init_watch(struct inotify_watch *watch) } EXPORT_SYMBOL_GPL(inotify_init_watch); +/* + * Watch removals suck violently. To kick the watch out we need (in this + * order) inode->inotify_mutex and ih->mutex. That's fine if we have + * a hold on inode; however, for all other cases we need to make damn sure + * we don't race with umount. We can *NOT* just grab a reference to a + * watch - inotify_unmount_inodes() will happily sail past it and we'll end + * with reference to inode potentially outliving its superblock. Ideally + * we just want to grab an active reference to superblock if we can; that + * will make sure we won't go into inotify_umount_inodes() until we are + * done. Cleanup is just deactivate_super(). However, that leaves a messy + * case - what if we *are* racing with umount() and active references to + * superblock can't be acquired anymore? We can bump ->s_count, grab + * ->s_umount, which will almost certainly wait until the superblock is shut + * down and the watch in question is pining for fjords. That's fine, but + * there is a problem - we might have hit the window between ->s_active + * getting to 0 / ->s_count - below S_BIAS (i.e. 
the moment when superblock + * is past the point of no return and is heading for shutdown) and the + * moment when deactivate_super() acquires ->s_umount. We could just do + * drop_super() yield() and retry, but that's rather antisocial and this + * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having + * found that we'd got there first (i.e. that ->s_root is non-NULL) we know + * that we won't race with inotify_umount_inodes(). So we could grab a + * reference to watch and do the rest as above, just with drop_super() instead + * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we + * could grab ->s_umount. So the watch could've been gone already. + * + * That still can be dealt with - we need to save watch->wd, do idr_find() + * and compare its result with our pointer. If they match, we either have + * the damn thing still alive or we'd lost not one but two races at once, + * the watch had been killed and a new one got created with the same ->wd + * at the same address. That couldn't have happened in inotify_destroy(), + * but inotify_rm_wd() could run into that. Still, "new one got created" + * is not a problem - we have every right to kill it or leave it alone, + * whatever's more convenient. + * + * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as + * "grab it and kill it" check. If it's been our original watch, we are + * fine, if it's a newcomer - nevermind, just pretend that we'd won the + * race and kill the fscker anyway; we are safe since we know that its + * superblock won't be going away. + * + * And yes, this is far beyond mere "not very pretty"; so's the entire + * concept of inotify to start with. + */ + +/** + * pin_to_kill - pin the watch down for removal + * @ih: inotify handle + * @watch: watch to kill + * + * Called with ih->mutex held, drops it. Possible return values: + * 0 - nothing to do, it has died + * 1 - remove it, drop the reference and deactivate_super() + * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid + * that variant, since it involved a lot of PITA, but that's the best that + * could've been done. + */ +static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch) +{ + struct super_block *sb = watch->inode->i_sb; + s32 wd = watch->wd; + + spin_lock(&sb_lock); + if (sb->s_count >= S_BIAS) { + atomic_inc(&sb->s_active); + spin_unlock(&sb_lock); + get_inotify_watch(watch); + mutex_unlock(&ih->mutex); + return 1; /* the best outcome */ + } + sb->s_count++; + spin_unlock(&sb_lock); + mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */ + down_read(&sb->s_umount); + if (likely(!sb->s_root)) { + /* fs is already shut down; the watch is dead */ + drop_super(sb); + return 0; + } + /* raced with the final deactivate_super() */ + mutex_lock(&ih->mutex); + if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) { + /* the watch is dead */ + mutex_unlock(&ih->mutex); + drop_super(sb); + return 0; + } + /* still alive or freed and reused with the same sb and wd; kill */ + get_inotify_watch(watch); + mutex_unlock(&ih->mutex); + return 2; +} + +static void unpin_and_kill(struct inotify_watch *watch, int how) +{ + struct super_block *sb = watch->inode->i_sb; + put_inotify_watch(watch); + switch (how) { + case 1: + deactivate_super(sb); + break; + case 2: + drop_super(sb); + } +} + /** * inotify_destroy - clean up and destroy an inotify instance * @ih: inotify handle @@ -490,11 +617,15 @@ void inotify_destroy(struct inotify_handle *ih) * pretty. 
We cannot do a simple iteration over the list, because we * do not know the inode until we iterate to the watch. But we need to * hold inode->inotify_mutex before ih->mutex. The following works. + * + * AV: it had to become even uglier to start working ;-/ */ while (1) { struct inotify_watch *watch; struct list_head *watches; + struct super_block *sb; struct inode *inode; + int how; mutex_lock(&ih->mutex); watches = &ih->watches; @@ -503,8 +634,10 @@ void inotify_destroy(struct inotify_handle *ih) break; } watch = list_first_entry(watches, struct inotify_watch, h_list); - get_inotify_watch(watch); - mutex_unlock(&ih->mutex); + sb = watch->inode->i_sb; + how = pin_to_kill(ih, watch); + if (!how) + continue; inode = watch->inode; mutex_lock(&inode->inotify_mutex); @@ -518,7 +651,7 @@ void inotify_destroy(struct inotify_handle *ih) mutex_unlock(&ih->mutex); mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(watch); + unpin_and_kill(watch, how); } /* free this handle: the put matching the get in inotify_init() */ @@ -719,7 +852,9 @@ void inotify_evict_watch(struct inotify_watch *watch) int inotify_rm_wd(struct inotify_handle *ih, u32 wd) { struct inotify_watch *watch; + struct super_block *sb; struct inode *inode; + int how; mutex_lock(&ih->mutex); watch = idr_find(&ih->idr, wd); @@ -727,9 +862,12 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) mutex_unlock(&ih->mutex); return -EINVAL; } - get_inotify_watch(watch); + sb = watch->inode->i_sb; + how = pin_to_kill(ih, watch); + if (!how) + return 0; + inode = watch->inode; - mutex_unlock(&ih->mutex); mutex_lock(&inode->inotify_mutex); mutex_lock(&ih->mutex); @@ -740,7 +878,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) mutex_unlock(&ih->mutex); mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(watch); + unpin_and_kill(watch, how); return 0; } diff --git a/include/linux/inotify.h b/include/linux/inotify.h index bd578578a8b..37ea2894b3c 100644 --- a/include/linux/inotify.h +++ b/include/linux/inotify.h @@ -134,6 +134,8 @@ extern void inotify_remove_watch_locked(struct inotify_handle *, struct inotify_watch *); extern void get_inotify_watch(struct inotify_watch *); extern void put_inotify_watch(struct inotify_watch *); +extern int pin_inotify_watch(struct inotify_watch *); +extern void unpin_inotify_watch(struct inotify_watch *); #else @@ -228,6 +230,15 @@ static inline void put_inotify_watch(struct inotify_watch *watch) { } +extern inline int pin_inotify_watch(struct inotify_watch *watch) +{ + return 0; +} + +extern inline void unpin_inotify_watch(struct inotify_watch *watch) +{ +} + #endif /* CONFIG_INOTIFY */ #endif /* __KERNEL __ */ diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 8ba0e0d934f..8b509441f49 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -24,6 +24,7 @@ struct audit_chunk { struct list_head trees; /* with root here */ int dead; int count; + atomic_long_t refs; struct rcu_head head; struct node { struct list_head list; @@ -56,7 +57,8 @@ static LIST_HEAD(prune_list); * tree is refcounted; one reference for "some rules on rules_list refer to * it", one for each chunk with pointer to it. * - * chunk is refcounted by embedded inotify_watch. + * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount + * of watch contributes 1 to .refs). * * node.index allows to get from node.list to containing chunk. 
* MSB of that sucker is stolen to mark taggings that we might have to @@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count) INIT_LIST_HEAD(&chunk->hash); INIT_LIST_HEAD(&chunk->trees); chunk->count = count; + atomic_long_set(&chunk->refs, 1); for (i = 0; i < count; i++) { INIT_LIST_HEAD(&chunk->owners[i].list); chunk->owners[i].index = i; @@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count) return chunk; } -static void __free_chunk(struct rcu_head *rcu) +static void free_chunk(struct audit_chunk *chunk) { - struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head); int i; for (i = 0; i < chunk->count; i++) { @@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu) kfree(chunk); } -static inline void free_chunk(struct audit_chunk *chunk) +void audit_put_chunk(struct audit_chunk *chunk) { - call_rcu(&chunk->head, __free_chunk); + if (atomic_long_dec_and_test(&chunk->refs)) + free_chunk(chunk); } -void audit_put_chunk(struct audit_chunk *chunk) +static void __put_chunk(struct rcu_head *rcu) { - put_inotify_watch(&chunk->watch); + struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head); + audit_put_chunk(chunk); } enum {HASH_SIZE = 128}; @@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) list_for_each_entry_rcu(p, list, hash) { if (p->watch.inode == inode) { - get_inotify_watch(&p->watch); + atomic_long_inc(&p->refs); return p; } } @@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) /* tagging and untagging inodes with trees */ -static void untag_chunk(struct audit_chunk *chunk, struct node *p) +static struct audit_chunk *find_chunk(struct node *p) +{ + int index = p->index & ~(1U<<31); + p -= index; + return container_of(p, struct audit_chunk, owners[0]); +} + +static void untag_chunk(struct node *p) { + struct audit_chunk *chunk = find_chunk(p); struct audit_chunk *new; struct audit_tree *owner; int size = chunk->count - 1; int i, j; + if (!pin_inotify_watch(&chunk->watch)) { + /* + * Filesystem is shutting down; all watches are getting + * evicted, just take it off the node list for this + * tree and let the eviction logics take care of the + * rest. + */ + owner = p->owner; + if (owner->root == chunk) { + list_del_init(&owner->same_root); + owner->root = NULL; + } + list_del_init(&p->list); + p->owner = NULL; + put_tree(owner); + return; + } + + spin_unlock(&hash_lock); + + /* + * pin_inotify_watch() succeeded, so the watch won't go away + * from under us. 
+ */ mutex_lock(&chunk->watch.inode->inotify_mutex); if (chunk->dead) { mutex_unlock(&chunk->watch.inode->inotify_mutex); - return; + goto out; } owner = p->owner; @@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p) inotify_evict_watch(&chunk->watch); mutex_unlock(&chunk->watch.inode->inotify_mutex); put_inotify_watch(&chunk->watch); - return; + goto out; } new = alloc_chunk(size); @@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p) inotify_evict_watch(&chunk->watch); mutex_unlock(&chunk->watch.inode->inotify_mutex); put_inotify_watch(&chunk->watch); - return; + goto out; Fallback: // do the best we can @@ -277,6 +313,9 @@ Fallback: put_tree(owner); spin_unlock(&hash_lock); mutex_unlock(&chunk->watch.inode->inotify_mutex); +out: + unpin_inotify_watch(&chunk->watch); + spin_lock(&hash_lock); } static int create_chunk(struct inode *inode, struct audit_tree *tree) @@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return 0; } -static struct audit_chunk *find_chunk(struct node *p) -{ - int index = p->index & ~(1U<<31); - p -= index; - return container_of(p, struct audit_chunk, owners[0]); -} - static void kill_rules(struct audit_tree *tree) { struct audit_krule *rule, *next; @@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim) spin_lock(&hash_lock); while (!list_empty(&victim->chunks)) { struct node *p; - struct audit_chunk *chunk; p = list_entry(victim->chunks.next, struct node, list); - chunk = find_chunk(p); - get_inotify_watch(&chunk->watch); - spin_unlock(&hash_lock); - - untag_chunk(chunk, p); - put_inotify_watch(&chunk->watch); - spin_lock(&hash_lock); + untag_chunk(p); } spin_unlock(&hash_lock); put_tree(victim); @@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree) while (!list_empty(&tree->chunks)) { struct node *node; - struct audit_chunk *chunk; node = list_entry(tree->chunks.next, struct node, list); @@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree) if (!(node->index & (1U<<31))) break; - chunk = find_chunk(node); - get_inotify_watch(&chunk->watch); - spin_unlock(&hash_lock); - - untag_chunk(chunk, node); - - put_inotify_watch(&chunk->watch); - spin_lock(&hash_lock); + untag_chunk(node); } if (!tree->root && !tree->goner) { tree->goner = 1; @@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, static void destroy_watch(struct inotify_watch *watch) { struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); - free_chunk(chunk); + call_rcu(&chunk->head, __put_chunk); } static const struct inotify_operations rtree_inotify_ops = { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b7d354e2b0e..9fd85a4640a 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list) list_for_each_entry_safe(p, n, in_list, ilist) { list_del(&p->ilist); inotify_rm_watch(audit_ih, &p->wdata); - /* the put matching the get in audit_do_del_rule() */ - put_inotify_watch(&p->wdata); + /* the unpin matching the pin in audit_do_del_rule() */ + unpin_inotify_watch(&p->wdata); } } @@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry, /* Put parent on the inotify un-registration * list. Grab a reference before releasing * audit_filter_mutex, to be released in - * audit_inotify_unregister(). 
*/ - list_add(&parent->ilist, &inotify_list); - get_inotify_watch(&parent->wdata); + * audit_inotify_unregister(). + If filesystem is going away, just leave + the sucker alone, eviction will take + care of it. + */ + if (pin_inotify_watch(&parent->wdata)) + list_add(&parent->ilist, &inotify_list); } } } -- cgit v1.2.3-70-g09d2 From e14c8bf86350f6c39186a139c5c584a6111b2f01 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 17 Nov 2008 08:22:18 +1030 Subject: stop_machine: fix race with return value (fixes Bug #11989) Bug #11989: Suspend failure on NForce4-based boards due to changes in stop_machine We should not access active.fnret outside the lock; in theory the next stop_machine could overwrite it. Signed-off-by: Rusty Russell Tested-by: "Rafael J. Wysocki" Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 9bc4c00872c..24e8ceacc38 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -112,7 +112,7 @@ static int chill(void *unused) int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { struct work_struct *sm_work; - int i; + int i, ret; /* Set up initial state. */ mutex_lock(&lock); @@ -137,8 +137,9 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) /* This will release the thread on our CPU. */ put_cpu(); flush_workqueue(stop_machine_wq); + ret = active.fnret; mutex_unlock(&lock); - return active.fnret; + return ret; } int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) -- cgit v1.2.3-70-g09d2 From 65ecc14a30ad21bed9aabdfd6a2ae1a1aaaa6a00 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Sat, 15 Nov 2008 12:02:34 -0600 Subject: Remove -mno-spe flags as they don't belong For some unknown reason Steven Rostedt added disabling of SPE instruction generation for e500-based PPC cores in commit 6ec562328fda585be2d7f472cfac99d3b44d362a. We are removing it because: 1. It generates e500 kernels that don't work 2. It's not the correct set of flags to do this 3. We handle this in the arch/powerpc/Makefile already 4. It's unknown, from talking to Steven, why he did this Signed-off-by: Kumar Gala Tested-and-Acked-by: Steven Rostedt Signed-off-by: Linus Torvalds --- kernel/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66a9d8..19fad003b19 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,8 +11,6 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o -CFLAGS_REMOVE_sched.o = -mno-spe - ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg @@ -21,7 +19,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg -CFLAGS_REMOVE_sched.o = -mno-spe -pg +CFLAGS_REMOVE_sched.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o -- cgit v1.2.3-70-g09d2
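
As a generic illustration of the pattern behind the stop_machine fix above (a user-space sketch, not kernel code; the names are made up): a result shared between back-to-back serialized callers has to be copied out while the serializing lock is still held, otherwise the next caller can overwrite it before the current caller returns it.

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int shared_result;		/* stands in for active.fnret */

int run_serialized(int (*fn)(void *), void *data)
{
	int ret;

	pthread_mutex_lock(&lock);	/* serializes back-to-back callers */
	shared_result = fn(data);
	ret = shared_result;		/* copy out before dropping the lock */
	pthread_mutex_unlock(&lock);
	return ret;			/* safe even if another caller runs immediately */
}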