From 58b79a91f57efec9457de8ff93a4cc4fb8daf753 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Sep 2013 15:31:08 -0400 Subject: cgroup: fix cgroup post-order descendant walk of empty subtree bd8815a6d8 ("cgroup: make css_for_each_descendant() and friends include the origin css in the iteration") updated cgroup descendant iterators to include the origin css; unfortuantely, it forgot to drop special case handling in css_next_descendant_post() for empty subtree leading to failure to visit the origin css without any child. Fix it by dropping the special case handling and always returning the leftmost descendant on the first iteration. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e0aeb32415f..8075b72d22b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3187,11 +3187,9 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, visit the leftmost descendant */ - if (!pos) { - next = css_leftmost_descendant(root); - return next != root ? next : NULL; - } + /* if first iteration, visit leftmost descendant which may be @root */ + if (!pos) + return css_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) -- cgit v1.2.3-70-g09d2 From fa7315871046b9a4c48627905691dbde57e51033 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Sep 2013 10:16:42 +0200 Subject: perf: Fix capabilities bitfield compatibility in 'struct perf_event_mmap_page' Solve the problems around the broken definition of perf_event_mmap_page:: cap_usr_time and cap_usr_rdpmc fields which used to overlap, partially fixed by: 860f085b74e9 ("perf: Fix broken union in 'struct perf_event_mmap_page'") The problem with the fix (merged in v3.12-rc1 and not yet released officially), noticed by Vince Weaver is that the new behavior is not detectable by new user-space, and that due to the reuse of the field names it's easy to mis-compile a binary if old headers are used on a new kernel or new headers are used on an old kernel. To solve all that make this change explicit, detectable and self-contained, by iterating the ABI the following way: - Always clear bit 0, and rename it to usrpage->cap_bit0, to at least not confuse old user-space binaries. RDPMC will be marked as unavailable to old binaries but that's within the ABI, this is a capability bit. - Rename bit 1 to ->cap_bit0_is_deprecated and always set it to 1, so new libraries can reliably detect that bit 0 is deprecated and perma-zero without having to check the kernel version. - Use bits 2, 3, 4 for the newly defined, correct functionality: cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ cap_user_time : 1, /* The time_* fields are used */ cap_user_time_zero : 1, /* The time_zero field is used */ - Rename all the bitfield names in perf_event.h to be different from the old names, to make sure it's not possible to mis-compile it accidentally with old assumptions. The 'size' field can then be used in the future to add new fields and it will act as a natural ABI version indicator as well. Also adjust tools/perf/ userspace for the new definitions, noticed by Adrian Hunter. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Also-Fixed-by: Adrian Hunter Link: http://lkml.kernel.org/n/tip-zr03yxjrpXesOzzupszqglbv@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 +++++----- include/uapi/linux/perf_event.h | 14 +++++++++----- kernel/events/core.c | 21 +++++++++++++++++++++ tools/perf/arch/x86/util/tsc.c | 6 +++--- 4 files changed, 38 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8355c84b972..a9c606bb494 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1883,9 +1883,9 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { - userpg->cap_usr_time = 0; - userpg->cap_usr_time_zero = 0; - userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) @@ -1894,13 +1894,13 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) return; - userpg->cap_usr_time = 1; + userpg->cap_user_time = 1; userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; if (sched_clock_stable && !check_tsc_disabled()) { - userpg->cap_usr_time_zero = 1; + userpg->cap_user_time_zero = 1; userpg->time_zero = this_cpu_read(cyc2ns_offset); } } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7f6d584c267..009a655a5d3 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -380,10 +380,13 @@ struct perf_event_mmap_page { union { __u64 capabilities; struct { - __u64 cap_usr_time : 1, - cap_usr_rdpmc : 1, - cap_usr_time_zero : 1, - cap_____res : 61; + __u64 cap_bit0 : 1, /* Always 0, deprecated, see commit 860f085b74e9 */ + cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ + + cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ + cap_user_time : 1, /* The time_* fields are used */ + cap_user_time_zero : 1, /* The time_zero field is used */ + cap_____res : 59; }; }; @@ -442,12 +445,13 @@ struct perf_event_mmap_page { * ((rem * time_mult) >> time_shift); */ __u64 time_zero; + __u32 size; /* Header size up to __reserved[] fields. */ /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[119]; /* align to 1k */ + __u8 __reserved[118*8+4]; /* align to 1k. */ /* * Control data for the mmap() data buffer. diff --git a/kernel/events/core.c b/kernel/events/core.c index dd236b66ca3..cb4238e85b3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3660,6 +3660,26 @@ static void calc_timer_values(struct perf_event *event, *running = ctx_time - event->tstamp_running; } +static void perf_event_init_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb) + goto unlock; + + userpg = rb->user_page; + + /* Allow new userspace to detect that bit 0 is deprecated */ + userpg->cap_bit0_is_deprecated = 1; + userpg->size = offsetof(struct perf_event_mmap_page, __reserved); + +unlock: + rcu_read_unlock(); +} + void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { } @@ -4044,6 +4064,7 @@ again: ring_buffer_attach(event, rb); rcu_assign_pointer(event->rb, rb); + perf_event_init_userpage(event); perf_event_update_userpage(event); unlock: diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c index 9570c2b0f83..b2519e49424 100644 --- a/tools/perf/arch/x86/util/tsc.c +++ b/tools/perf/arch/x86/util/tsc.c @@ -32,7 +32,7 @@ u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc) int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, struct perf_tsc_conversion *tc) { - bool cap_usr_time_zero; + bool cap_user_time_zero; u32 seq; int i = 0; @@ -42,7 +42,7 @@ int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, tc->time_mult = pc->time_mult; tc->time_shift = pc->time_shift; tc->time_zero = pc->time_zero; - cap_usr_time_zero = pc->cap_usr_time_zero; + cap_user_time_zero = pc->cap_user_time_zero; rmb(); if (pc->lock == seq && !(seq & 1)) break; @@ -52,7 +52,7 @@ int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, } } - if (!cap_usr_time_zero) + if (!cap_user_time_zero) return -EOPNOTSUPP; return 0; -- cgit v1.2.3-70-g09d2 From b18855500fc40da050512d9df82d2f1471e59642 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sun, 15 Sep 2013 17:49:13 +0400 Subject: sched/balancing: Fix 'local->avg_load > sds->avg_load' case in calculate_imbalance() In busiest->group_imb case we can come to calculate_imbalance() with local->avg_load >= busiest->avg_load >= sds->avg_load. This can result in imbalance overflow, because it is calculated as follows env->imbalance = min( max_pull * busiest->group_power, (sds->avg_load - local->avg_load) * local->group_power) / SCHED_POWER_SCALE; As a result we can end up constantly bouncing tasks from one cpu to another if there are pinned tasks. Fix this by skipping the assignment and assuming imbalance=0 in case local->avg_load > sds->avg_load. [ The bug can be caught by running 2*N cpuhogs pinned to two logical cpus belonging to different cores on an HT-enabled machine with N logical cpus: just look at se.nr_migrations growth. ] Signed-off-by: Vladimir Davydov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/8f596cc6bc0e5e655119dc892c9bfcad26e971f4.1379252740.git.vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11cd1366735..0b99aae339c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4896,7 +4896,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * max load less than avg load(as we skip the groups at or below * its cpu_power, while calculating max_load..) */ - if (busiest->avg_load < sds->avg_load) { + if (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load) { env->imbalance = 0; return fix_small_imbalance(env, sds); } -- cgit v1.2.3-70-g09d2 From 3029ede39373c368f402a76896600d85a4f7121b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sun, 15 Sep 2013 17:49:14 +0400 Subject: sched/balancing: Fix 'local->avg_load > busiest->avg_load' case in fix_small_imbalance() In busiest->group_imb case we can come to fix_small_imbalance() with local->avg_load > busiest->avg_load. This can result in wrong imbalance fix-up, because there is the following check there where all the members are unsigned: if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { env->imbalance = busiest->load_per_task; return; } As a result we can end up constantly bouncing tasks from one cpu to another if there are pinned tasks. Fix it by substituting the subtraction with an equivalent addition in the check. [ The bug can be caught by running 2*N cpuhogs pinned to two logical cpus belonging to different cores on an HT-enabled machine with N logical cpus: just look at se.nr_migrations growth. ] Signed-off-by: Vladimir Davydov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/ef167822e5c5b2d96cf5b0e3e4f4bdff3f0414a2.1379252740.git.vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b99aae339c..2aedaccebcc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4823,8 +4823,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) (busiest->load_per_task * SCHED_POWER_SCALE) / busiest->group_power; - if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= - (scaled_busy_load_per_task * imbn)) { + if (busiest->avg_load + scaled_busy_load_per_task >= + local->avg_load + (scaled_busy_load_per_task * imbn)) { env->imbalance = busiest->load_per_task; return; } -- cgit v1.2.3-70-g09d2 From 7e3115ef5149fc502e3a2e80719dba54a8e7409d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sat, 14 Sep 2013 19:39:46 +0400 Subject: sched/balancing: Fix cfs_rq->task_h_load calculation Patch a003a2 (sched: Consider runnable load average in move_tasks()) sets all top-level cfs_rqs' h_load to rq->avg.load_avg_contrib, which is always 0. This mistype leads to all tasks having weight 0 when load balancing in a cpu-cgroup enabled setup. There obviously should be sum of weights of all runnable tasks there instead. Fix it. Signed-off-by: Vladimir Davydov Reviewed-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1379173186-11944-1-git-send-email-vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2aedaccebcc..7c70201fbc6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4242,7 +4242,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } if (!se) { - cfs_rq->h_load = rq->avg.load_avg_contrib; + cfs_rq->h_load = cfs_rq->runnable_load_avg; cfs_rq->last_h_load_update = now; } -- cgit v1.2.3-70-g09d2 From 0c06a5d4b13cd66c833805a0d1db76b977944aac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Sep 2013 00:54:17 +0200 Subject: arm: Fix build error with context tracking calls ad65782fba50 (context_tracking: Optimize main APIs off case with static key) converted context tracking main APIs to inline function and left ARM asm callers behind. This can be easily fixed by making ARM calling the post static keys context tracking function. We just need to replicate the static key checks there. We'll remove these later when ARM will support the context tracking static keys. Reported-by: Guenter Roeck Reported-by: Russell King Signed-off-by: Frederic Weisbecker Tested-by: Kevin Hilman Cc: Nicolas Pitre Cc: Anil Kumar Cc: Tony Lindgren Cc: Benoit Cousson Cc: Guenter Roeck Cc: Russell King Cc: Kevin Hilman --- arch/arm/kernel/entry-header.S | 8 ++++---- kernel/context_tracking.c | 12 ++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S index de23a9beed1..39f89fbd511 100644 --- a/arch/arm/kernel/entry-header.S +++ b/arch/arm/kernel/entry-header.S @@ -329,10 +329,10 @@ #ifdef CONFIG_CONTEXT_TRACKING .if \save stmdb sp!, {r0-r3, ip, lr} - bl user_exit + bl context_tracking_user_exit ldmia sp!, {r0-r3, ip, lr} .else - bl user_exit + bl context_tracking_user_exit .endif #endif .endm @@ -341,10 +341,10 @@ #ifdef CONFIG_CONTEXT_TRACKING .if \save stmdb sp!, {r0-r3, ip, lr} - bl user_enter + bl context_tracking_user_enter ldmia sp!, {r0-r3, ip, lr} .else - bl user_enter + bl context_tracking_user_enter .endif #endif .endm diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 247091bf058..859c8dfd78a 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -50,6 +50,15 @@ void context_tracking_user_enter(void) { unsigned long flags; + /* + * Repeat the user_enter() check here because some archs may be calling + * this from asm and if no CPU needs context tracking, they shouldn't + * go further. Repeat the check here until they support the static key + * check. + */ + if (!static_key_false(&context_tracking_enabled)) + return; + /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: @@ -151,6 +160,9 @@ void context_tracking_user_exit(void) { unsigned long flags; + if (!static_key_false(&context_tracking_enabled)) + return; + if (in_interrupt()) return; -- cgit v1.2.3-70-g09d2 From 3a126f85e015701e56240884f27f97543580d5f7 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 27 Sep 2013 13:17:39 -0700 Subject: kernel/params: fix handling of signed integer types Commit 6072ddc8520b ("kernel: replace strict_strto*() with kstrto*()") broke the handling of signed integer types, fix it. Signed-off-by: Jean Delvare Reported-by: Christian Kujau Tested-by: Christian Kujau Cc: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 81c4e78c8f4..c00d5b502aa 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -254,11 +254,11 @@ int parse_args(const char *doing, STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtoul); +STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, kstrtoul); +STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, kstrtoul); +STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); int param_set_charp(const char *val, const struct kernel_param *kp) -- cgit v1.2.3-70-g09d2 From aab1728915420b5288cd0fc7b5bd320105b48983 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 30 Sep 2013 19:40:56 +0200 Subject: PM / hibernate: Fix user space driven resume regression Recent commit 8fd37a4 (PM / hibernate: Create memory bitmaps after freezing user space) broke the resume part of the user space driven hibernation (s2disk), because I forgot that the resume utility loaded the image into memory without freezing user space (it still freezes tasks after loading the image). This means that during user space driven resume we need to create the memory bitmaps at the "device open" time rather than at the "freeze tasks" time, so make that happen (that's a special case anyway, so it needs to be treated in a special way). Reported-and-tested-by: Ronald Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 5 ++++- kernel/power/user.c | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 358a146fd4d..98c3b34a4cf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -743,7 +743,10 @@ int create_basic_memory_bitmaps(void) struct memory_bitmap *bm1, *bm2; int error = 0; - BUG_ON(forbidden_pages_map || free_pages_map); + if (forbidden_pages_map && free_pages_map) + return 0; + else + BUG_ON(forbidden_pages_map || free_pages_map); bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); if (!bm1) diff --git a/kernel/power/user.c b/kernel/power/user.c index 72e8f4fd616..957f06164ad 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -39,6 +39,7 @@ static struct snapshot_data { char frozen; char ready; char platform_support; + bool free_bitmaps; } snapshot_state; atomic_t snapshot_device_available = ATOMIC_INIT(1); @@ -82,6 +83,10 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = -1; data->mode = O_WRONLY; error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + if (!error) { + error = create_basic_memory_bitmaps(); + data->free_bitmaps = !error; + } if (error) pm_notifier_call_chain(PM_POST_RESTORE); } @@ -111,6 +116,8 @@ static int snapshot_release(struct inode *inode, struct file *filp) pm_restore_gfp_mask(); free_basic_memory_bitmaps(); thaw_processes(); + } else if (data->free_bitmaps) { + free_basic_memory_bitmaps(); } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); @@ -231,6 +238,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; pm_restore_gfp_mask(); free_basic_memory_bitmaps(); + data->free_bitmaps = false; thaw_processes(); data->frozen = 0; break; -- cgit v1.2.3-70-g09d2 From 4c1c7be95c345cf2ad537a0c48e9aeadc7304527 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 30 Sep 2013 13:45:08 -0700 Subject: kernel/kmod.c: check for NULL in call_usermodehelper_exec() If /proc/sys/kernel/core_pattern contains only "|", a NULL pointer dereference happens upon core dump because argv_split("") returns argv[0] == NULL. This bug was once fixed by commit 264b83c07a84 ("usermodehelper: check subprocess_info->path != NULL") but was by error reintroduced by commit 7f57cfa4e2aa ("usermodehelper: kill the sub_info->path[0] check"). This bug seems to exist since 2.6.19 (the version which core dump to pipe was added). Depending on kernel version and config, some side effect might happen immediately after this oops (e.g. kernel panic with 2.6.32-358.18.1.el6). Signed-off-by: Tetsuo Handa Acked-by: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index fb326365b69..b086006c59e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -571,6 +571,10 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) DECLARE_COMPLETION_ONSTACK(done); int retval = 0; + if (!sub_info->path) { + call_usermodehelper_freeinfo(sub_info); + return -EINVAL; + } helper_lock(); if (!khelper_wq || usermodehelper_disabled) { retval = -EBUSY; -- cgit v1.2.3-70-g09d2 From 314a8ad0f18ac37887896b288939acd8cb17e208 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 30 Sep 2013 13:45:27 -0700 Subject: pidns: fix free_pid() to handle the first fork failure "case 0" in free_pid() assumes that disable_pid_allocation() should clear PIDNS_HASH_ADDING before the last pid goes away. However this doesn't happen if the first fork() fails to create the child reaper which should call disable_pid_allocation(). Signed-off-by: Oleg Nesterov Reviewed-by: "Eric W. Biederman" Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index ebe5e80b10f..9b9a2669814 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -273,6 +273,11 @@ void free_pid(struct pid *pid) */ wake_up_process(ns->child_reaper); break; + case PIDNS_HASH_ADDING: + /* Handle a fork failure of the first process */ + WARN_ON(ns->child_reaper); + ns->nr_hashed = 0; + /* fall through */ case 0: schedule_work(&ns->proc_work); break; -- cgit v1.2.3-70-g09d2 From ded797547548a5b8e7b92383a41e4c0e6b0ecb7f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 24 Sep 2013 00:50:25 +0200 Subject: irq: Force hardirq exit's softirq processing on its own stack The commit facd8b80c67a3cf64a467c4a2ac5fb31f2e6745b ("irq: Sanitize invoke_softirq") converted irq exit calls of do_softirq() to __do_softirq() on all architectures, assuming it was only used there for its irq disablement properties. But as a side effect, the softirqs processed in the end of the hardirq are always called on the inline current stack that is used by irq_exit() instead of the softirq stack provided by the archs that override do_softirq(). The result is mostly safe if the architecture runs irq_exit() on a separate irq stack because then softirqs are processed on that same stack that is near empty at this stage (assuming hardirq aren't nesting). Otherwise irq_exit() runs in the task stack and so does the softirq too. The interrupted call stack can be randomly deep already and the softirq can dig through it even further. To add insult to the injury, this softirq can be interrupted by a new hardirq, maximizing the chances for a stack overrun as reported in powerpc for example: do_IRQ: stack overflow: 1920 CPU: 0 PID: 1602 Comm: qemu-system-ppc Not tainted 3.10.4-300.1.fc19.ppc64p7 #1 Call Trace: [c0000000050a8740] .show_stack+0x130/0x200 (unreliable) [c0000000050a8810] .dump_stack+0x28/0x3c [c0000000050a8880] .do_IRQ+0x2b8/0x2c0 [c0000000050a8930] hardware_interrupt_common+0x154/0x180 --- Exception: 501 at .cp_start_xmit+0x3a4/0x820 [8139cp] LR = .cp_start_xmit+0x390/0x820 [8139cp] [c0000000050a8d40] .dev_hard_start_xmit+0x394/0x640 [c0000000050a8e00] .sch_direct_xmit+0x110/0x260 [c0000000050a8ea0] .dev_queue_xmit+0x260/0x630 [c0000000050a8f40] .br_dev_queue_push_xmit+0xc4/0x130 [bridge] [c0000000050a8fc0] .br_dev_xmit+0x198/0x270 [bridge] [c0000000050a9070] .dev_hard_start_xmit+0x394/0x640 [c0000000050a9130] .dev_queue_xmit+0x428/0x630 [c0000000050a91d0] .ip_finish_output+0x2a4/0x550 [c0000000050a9290] .ip_local_out+0x50/0x70 [c0000000050a9310] .ip_queue_xmit+0x148/0x420 [c0000000050a93b0] .tcp_transmit_skb+0x4e4/0xaf0 [c0000000050a94a0] .__tcp_ack_snd_check+0x7c/0xf0 [c0000000050a9520] .tcp_rcv_established+0x1e8/0x930 [c0000000050a95f0] .tcp_v4_do_rcv+0x21c/0x570 [c0000000050a96c0] .tcp_v4_rcv+0x734/0x930 [c0000000050a97a0] .ip_local_deliver_finish+0x184/0x360 [c0000000050a9840] .ip_rcv_finish+0x148/0x400 [c0000000050a98d0] .__netif_receive_skb_core+0x4f8/0xb00 [c0000000050a99d0] .netif_receive_skb+0x44/0x110 [c0000000050a9a70] .br_handle_frame_finish+0x2bc/0x3f0 [bridge] [c0000000050a9b20] .br_nf_pre_routing_finish+0x2ac/0x420 [bridge] [c0000000050a9bd0] .br_nf_pre_routing+0x4dc/0x7d0 [bridge] [c0000000050a9c70] .nf_iterate+0x114/0x130 [c0000000050a9d30] .nf_hook_slow+0xb4/0x1e0 [c0000000050a9e00] .br_handle_frame+0x290/0x330 [bridge] [c0000000050a9ea0] .__netif_receive_skb_core+0x34c/0xb00 [c0000000050a9fa0] .netif_receive_skb+0x44/0x110 [c0000000050aa040] .napi_gro_receive+0xe8/0x120 [c0000000050aa0c0] .cp_rx_poll+0x31c/0x590 [8139cp] [c0000000050aa1d0] .net_rx_action+0x1dc/0x310 [c0000000050aa2b0] .__do_softirq+0x158/0x330 [c0000000050aa3b0] .irq_exit+0xc8/0x110 [c0000000050aa430] .do_IRQ+0xdc/0x2c0 [c0000000050aa4e0] hardware_interrupt_common+0x154/0x180 --- Exception: 501 at .bad_range+0x1c/0x110 LR = .get_page_from_freelist+0x908/0xbb0 [c0000000050aa7d0] .list_del+0x18/0x50 (unreliable) [c0000000050aa850] .get_page_from_freelist+0x908/0xbb0 [c0000000050aa9e0] .__alloc_pages_nodemask+0x21c/0xae0 [c0000000050aaba0] .alloc_pages_vma+0xd0/0x210 [c0000000050aac60] .handle_pte_fault+0x814/0xb70 [c0000000050aad50] .__get_user_pages+0x1a4/0x640 [c0000000050aae60] .get_user_pages_fast+0xec/0x160 [c0000000050aaf10] .__gfn_to_pfn_memslot+0x3b0/0x430 [kvm] [c0000000050aafd0] .kvmppc_gfn_to_pfn+0x64/0x130 [kvm] [c0000000050ab070] .kvmppc_mmu_map_page+0x94/0x530 [kvm] [c0000000050ab190] .kvmppc_handle_pagefault+0x174/0x610 [kvm] [c0000000050ab270] .kvmppc_handle_exit_pr+0x464/0x9b0 [kvm] [c0000000050ab320] kvm_start_lightweight+0x1ec/0x1fc [kvm] [c0000000050ab4f0] .kvmppc_vcpu_run_pr+0x168/0x3b0 [kvm] [c0000000050ab9c0] .kvmppc_vcpu_run+0xc8/0xf0 [kvm] [c0000000050aba50] .kvm_arch_vcpu_ioctl_run+0x5c/0x1a0 [kvm] [c0000000050abae0] .kvm_vcpu_ioctl+0x478/0x730 [kvm] [c0000000050abc90] .do_vfs_ioctl+0x4ec/0x7c0 [c0000000050abd80] .SyS_ioctl+0xd4/0xf0 [c0000000050abe30] syscall_exit+0x0/0x98 Since this is a regression, this patch proposes a minimalistic and low-risk solution by blindly forcing the hardirq exit processing of softirqs on the softirq stack. This way we should reduce significantly the opportunities for task stack overflow dug by softirqs. Longer term solutions may involve extending the hardirq stack coverage to irq_exit(), etc... Reported-by: Benjamin Herrenschmidt Acked-by: Linus Torvalds Signed-off-by: Frederic Weisbecker Cc: #3.9.. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul Mackerras Cc: James Hogan Cc: James E.J. Bottomley Cc: Helge Deller Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David S. Miller Cc: Andrew Morton --- kernel/softirq.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 53cc09ceb0b..d7d498d8cc4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -328,10 +328,19 @@ void irq_enter(void) static inline void invoke_softirq(void) { - if (!force_irqthreads) - __do_softirq(); - else + if (!force_irqthreads) { + /* + * We can safely execute softirq on the current stack if + * it is the irq stack, because it should be near empty + * at this stage. But we have no way to know if the arch + * calls irq_exit() on the irq stack. So call softirq + * in its own stack to prevent from any overrun on top + * of a potentially deep task stack. + */ + do_softirq(); + } else { wakeup_softirqd(); + } } static inline void tick_irq_exit(void) -- cgit v1.2.3-70-g09d2 From 9886167d20c0720dcfb01e62cdff4d906b226f43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Oct 2013 16:02:23 +0200 Subject: perf: Fix perf_pmu_migrate_context While auditing the list_entry usage due to a trinity bug I found that perf_pmu_migrate_context violates the rules for perf_event::event_entry. The problem is that perf_event::event_entry is a RCU list element, and hence we must wait for a full RCU grace period before re-using the element after deletion. Therefore the usage in perf_pmu_migrate_context() which re-uses the entry immediately is broken. For now introduce another list_head into perf_event for this specific usage. This doesn't actually fix the trinity report because that never goes through this code. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-mkj72lxagw1z8fvjm648iznw@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 24 +++++++++++++++++++++++- kernel/events/core.c | 6 +++--- 2 files changed, 26 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 866e85c5eb9..c8ba627c1d6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -294,9 +294,31 @@ struct ring_buffer; */ struct perf_event { #ifdef CONFIG_PERF_EVENTS - struct list_head group_entry; + /* + * entry onto perf_event_context::event_list; + * modifications require ctx->lock + * RCU safe iterations. + */ struct list_head event_entry; + + /* + * XXX: group_entry and sibling_list should be mutually exclusive; + * either you're a sibling on a group, or you're the group leader. + * Rework the code to always use the same list element. + * + * Locked for modification by both ctx->mutex and ctx->lock; holding + * either sufficies for read. + */ + struct list_head group_entry; struct list_head sibling_list; + + /* + * We need storage to track the entries in perf_pmu_migrate_context; we + * cannot use the event_entry because of RCU and we want to keep the + * group in tact which avoids us using the other two entries. + */ + struct list_head migrate_entry; + struct hlist_node hlist_entry; int nr_siblings; int group_flags; diff --git a/kernel/events/core.c b/kernel/events/core.c index cb4238e85b3..d49a9d29334 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7234,15 +7234,15 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) perf_remove_from_context(event); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); - list_add(&event->event_entry, &events); + list_add(&event->migrate_entry, &events); } mutex_unlock(&src_ctx->mutex); synchronize_rcu(); mutex_lock(&dst_ctx->mutex); - list_for_each_entry_safe(event, tmp, &events, event_entry) { - list_del(&event->event_entry); + list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_del(&event->migrate_entry); if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; account_event_cpu(event, dst_cpu); -- cgit v1.2.3-70-g09d2 From ea84753c98a7ac6b74e530b64c444a912b3835ca Mon Sep 17 00:00:00 2001 From: Anjana V Kumar Date: Sat, 12 Oct 2013 10:59:17 +0800 Subject: cgroup: fix to break the while loop in cgroup_attach_task() correctly Both Anjana and Eunki reported a stall in the while_each_thread loop in cgroup_attach_task(). It's because, when we attach a single thread to a cgroup, if the cgroup is exiting or is already in that cgroup, we won't break the loop. If the task is already in the cgroup, the bug can lead to another thread being attached to the cgroup unexpectedly: # echo 5207 > tasks # cat tasks 5207 # echo 5207 > tasks # cat tasks 5207 5215 What's worse, if the task to be attached isn't the leader of the thread group, we might never exit the loop, hence cpu stall. Thanks for Oleg's analysis. This bug was introduced by commit 081aa458c38ba576bdd4265fc807fa95b48b9e79 ("cgroup: consolidate cgroup_attach_task() and cgroup_attach_proc()") [ lizf: - fixed the first continue, pointed out by Oleg, - rewrote changelog. ] Cc: # 3.9+ Reported-by: Eunki Kim Reported-by: Anjana V Kumar Signed-off-by: Anjana V Kumar Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8075b72d22b..1bf4f7a1270 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2038,7 +2038,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) - continue; + goto next; /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); @@ -2046,7 +2046,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, ent.cgrp = task_cgroup_from_root(tsk, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) - continue; + goto next; /* * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. @@ -2054,7 +2054,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; - + next: if (!threadgroup) break; } while_each_thread(leader, tsk); -- cgit v1.2.3-70-g09d2 From 3090ffb5a2515990182f3f55b0688a7817325488 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 17 Oct 2013 19:32:15 +0200 Subject: perf: Disable PERF_RECORD_MMAP2 support For now, we disable the extended MMAP record support (MMAP2). We have identified cases where it would not report the correct mapping information, clone(VM_CLONE) but with separate pids. We will revisit the support once we find a solution for this case. The patch changes the kernel to return EINVAL if attr->mmap2 is set. The patch also modifies the perf tool to use regular PERF_RECORD_MMAP for synthetic events and it also prevents the tool from requesting attr->mmap2 mode because the kernel would reject it. The support will be revisited once the kenrel interface is updated. In V2, we reduce the patch to the strict minimum. In V3, we avoid calling perf_event_open() with mmap2 set because we know it will fail and require fallback retry. Signed-off-by: Stephane Eranian Cc: Andi Kleen Cc: David Ahern Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20131017173215.GA8820@quad Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 4 ++++ tools/perf/util/event.c | 30 +++++++++++++----------------- tools/perf/util/evsel.c | 1 - 3 files changed, 17 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d49a9d29334..953c1434837 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6767,6 +6767,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; + /* disabled for now */ + if (attr->mmap2) + return -EINVAL; + if (attr->__reserved_1) return -EINVAL; diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 9b393e7dca6..63df031fc9c 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -187,7 +187,7 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, return -1; } - event->header.type = PERF_RECORD_MMAP2; + event->header.type = PERF_RECORD_MMAP; /* * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c */ @@ -198,7 +198,6 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, char prot[5]; char execname[PATH_MAX]; char anonstr[] = "//anon"; - unsigned int ino; size_t size; ssize_t n; @@ -209,13 +208,10 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, strcpy(execname, ""); /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */ - n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %x:%x %u %s\n", - &event->mmap2.start, &event->mmap2.len, prot, - &event->mmap2.pgoff, &event->mmap2.maj, - &event->mmap2.min, - &ino, execname); - - event->mmap2.ino = (u64)ino; + n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %*x:%*x %*u %s\n", + &event->mmap.start, &event->mmap.len, prot, + &event->mmap.pgoff, + execname); if (n != 8) continue; @@ -227,15 +223,15 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, strcpy(execname, anonstr); size = strlen(execname) + 1; - memcpy(event->mmap2.filename, execname, size); + memcpy(event->mmap.filename, execname, size); size = PERF_ALIGN(size, sizeof(u64)); - event->mmap2.len -= event->mmap.start; - event->mmap2.header.size = (sizeof(event->mmap2) - - (sizeof(event->mmap2.filename) - size)); - memset(event->mmap2.filename + size, 0, machine->id_hdr_size); - event->mmap2.header.size += machine->id_hdr_size; - event->mmap2.pid = tgid; - event->mmap2.tid = pid; + event->mmap.len -= event->mmap.start; + event->mmap.header.size = (sizeof(event->mmap) - + (sizeof(event->mmap.filename) - size)); + memset(event->mmap.filename + size, 0, machine->id_hdr_size); + event->mmap.header.size += machine->id_hdr_size; + event->mmap.pid = tgid; + event->mmap.tid = pid; if (process(tool, event, &synth_sample, machine) != 0) { rc = -1; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 0ce9febf1ba..9f1ef9bee2d 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -678,7 +678,6 @@ void perf_evsel__config(struct perf_evsel *evsel, attr->sample_type |= PERF_SAMPLE_WEIGHT; attr->mmap = track; - attr->mmap2 = track && !perf_missing_features.mmap2; attr->comm = track; /* -- cgit v1.2.3-70-g09d2 From b0267507dfd0187fb7840a0ec461a510a7f041c5 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 17 Oct 2013 19:45:29 +0900 Subject: mutex: Avoid gcc version dependent __builtin_constant_p() usage Commit 040a0a37 ("mutex: Add support for wound/wait style locks") used "!__builtin_constant_p(p == NULL)" but gcc 3.x cannot handle such expression correctly, leading to boot failure when built with CONFIG_DEBUG_MUTEXES=y. Fix it by explicitly passing a bool which tells whether p != NULL or not. [ PeterZ: This is a sad patch, but provided it actually generates similar code I suppose its the best we can do bar whole sale deprecating gcc-3. ] Signed-off-by: Tetsuo Handa Acked-by: Peter Zijlstra Acked-by: Maarten Lankhorst Cc: peterz@infradead.org Cc: imirkin@alum.mit.edu Cc: daniel.vetter@ffwll.ch Cc: robdclark@gmail.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/201310171945.AGB17114.FSQVtHOJFOOFML@I-love.SAKURA.ne.jp Signed-off-by: Ingo Molnar --- kernel/mutex.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 6d647aedffe..d24105b1b79 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -410,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, static __always_inline int __sched __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { struct task_struct *task = current; struct mutex_waiter waiter; @@ -450,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *owner; struct mspin_node node; - if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { + if (use_ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; ww = container_of(lock, struct ww_mutex, base); @@ -480,7 +480,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); - if (!__builtin_constant_p(ww_ctx == NULL)) { + if (use_ww_ctx) { struct ww_mutex *ww; ww = container_of(lock, struct ww_mutex, base); @@ -551,7 +551,7 @@ slowpath: goto err; } - if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { + if (use_ww_ctx && ww_ctx->acquired > 0) { ret = __mutex_lock_check_stamp(lock, ww_ctx); if (ret) goto err; @@ -575,7 +575,7 @@ skip_wait: lock_acquired(&lock->dep_map, ip); mutex_set_owner(lock); - if (!__builtin_constant_p(ww_ctx == NULL)) { + if (use_ww_ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct mutex_waiter *cur; @@ -615,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL); + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -625,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { might_sleep(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - 0, nest, _RET_IP_, NULL); + 0, nest, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); @@ -635,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_KILLABLE, - subclass, NULL, _RET_IP_, NULL); + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); @@ -644,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL); + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -682,7 +682,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx); + 0, &ctx->dep_map, _RET_IP_, ctx, 1); if (!ret && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -697,7 +697,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx); + 0, &ctx->dep_map, _RET_IP_, ctx, 1); if (!ret && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -809,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count) struct mutex *lock = container_of(lock_count, struct mutex, count); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL); + NULL, _RET_IP_, NULL, 0); } static noinline int __sched __mutex_lock_killable_slowpath(struct mutex *lock) { return __mutex_lock_common(lock, TASK_KILLABLE, 0, - NULL, _RET_IP_, NULL); + NULL, _RET_IP_, NULL, 0); } static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock) { return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL); + NULL, _RET_IP_, NULL, 0); } static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx); + NULL, _RET_IP_, ctx, 1); } static noinline int __sched @@ -838,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx); + NULL, _RET_IP_, ctx, 1); } #endif -- cgit v1.2.3-70-g09d2 From 97b9410643475d6557d2517c2aff9fd2221141a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Sep 2013 21:50:23 +0200 Subject: clockevents: Sanitize ticks to nsec conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Marc Kleine-Budde pointed out, that commit 77cc982 "clocksource: use clockevents_config_and_register() where possible" caused a regression for some of the converted subarchs. The reason is, that the clockevents core code converts the minimal hardware tick delta to a nanosecond value for core internal usage. This conversion is affected by integer math rounding loss, so the backwards conversion to hardware ticks will likely result in a value which is less than the configured hardware limitation. The affected subarchs used their own workaround (SIGH!) which got lost in the conversion. The solution for the issue at hand is simple: adding evt->mult - 1 to the shifted value before the integer divison in the core conversion function takes care of it. But this only works for the case where for the scaled math mult/shift pair "mult <= 1 << shift" is true. For the case where "mult > 1 << shift" we can apply the rounding add only for the minimum delta value to make sure that the backward conversion is not less than the given hardware limit. For the upper bound we need to omit the rounding add, because the backwards conversion is always larger than the original latch value. That would violate the upper bound of the hardware device. Though looking closer at the details of that function reveals another bogosity: The upper bounds check is broken as well. Checking for a resulting "clc" value greater than KTIME_MAX after the conversion is pointless. The conversion does: u64 clc = (latch << evt->shift) / evt->mult; So there is no sanity check for (latch << evt->shift) exceeding the 64bit boundary. The latch argument is "unsigned long", so on a 64bit arch the handed in argument could easily lead to an unnoticed shift overflow. With the above rounding fix applied the calculation before the divison is: u64 clc = (latch << evt->shift) + evt->mult - 1; So we need to make sure, that neither the shift nor the rounding add is overflowing the u64 boundary. [ukl: move assignment to rnd after eventually changing mult, fix build issue and correct comment with the right math] Signed-off-by: Thomas Gleixner Cc: Russell King - ARM Linux Cc: Marc Kleine-Budde Cc: nicolas.ferre@atmel.com Cc: Marc Pignat Cc: john.stultz@linaro.org Cc: kernel@pengutronix.de Cc: Ronald Wahl Cc: LAK Cc: Ludovic Desroches Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1380052223-24139-1-git-send-email-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- kernel/time/clockevents.c | 65 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 38959c86678..662c5798a68 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -33,29 +33,64 @@ struct ce_unbind { int res; }; -/** - * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds - * @latch: value to convert - * @evt: pointer to clock event device descriptor - * - * Math helper, returns latch value converted to nanoseconds (bound checked) - */ -u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, + bool ismax) { u64 clc = (u64) latch << evt->shift; + u64 rnd; if (unlikely(!evt->mult)) { evt->mult = 1; WARN_ON(1); } + rnd = (u64) evt->mult - 1; + + /* + * Upper bound sanity check. If the backwards conversion is + * not equal latch, we know that the above shift overflowed. + */ + if ((clc >> evt->shift) != (u64)latch) + clc = ~0ULL; + + /* + * Scaled math oddities: + * + * For mult <= (1 << shift) we can safely add mult - 1 to + * prevent integer rounding loss. So the backwards conversion + * from nsec to device ticks will be correct. + * + * For mult > (1 << shift), i.e. device frequency is > 1GHz we + * need to be careful. Adding mult - 1 will result in a value + * which when converted back to device ticks can be larger + * than latch by up to (mult - 1) >> shift. For the min_delta + * calculation we still want to apply this in order to stay + * above the minimum device ticks limit. For the upper limit + * we would end up with a latch value larger than the upper + * limit of the device, so we omit the add to stay below the + * device upper boundary. + * + * Also omit the add if it would overflow the u64 boundary. + */ + if ((~0ULL - clc > rnd) && + (!ismax || evt->mult <= (1U << evt->shift))) + clc += rnd; do_div(clc, evt->mult); - if (clc < 1000) - clc = 1000; - if (clc > KTIME_MAX) - clc = KTIME_MAX; - return clc; + /* Deltas less than 1usec are pointless noise */ + return clc > 1000 ? clc : 1000; +} + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch: value to convert + * @evt: pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +{ + return cev_delta2ns(latch, evt, false); } EXPORT_SYMBOL_GPL(clockevent_delta2ns); @@ -380,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq) sec = 600; clockevents_calc_mult_shift(dev, freq, sec); - dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); - dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); + dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); + dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); } /** -- cgit v1.2.3-70-g09d2 From d3c345dbc7c083414ef74eb22ff26ba2bd100759 Mon Sep 17 00:00:00 2001 From: Russ Dill Date: Thu, 24 Oct 2013 14:25:26 +0100 Subject: PM / hibernate: Move software_resume to late_initcall_sync software_resume is being called after deferred_probe_initcall in drivers base. If the probing of the device that contains the resume image is deferred, and the system has been instructed to wait for it to show up, this wait will occur in software_resume. This causes a deadlock. Move software_resume into late_initcall_sync so that it happens after all the other late_initcalls. Signed-off-by: Russ Dill Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c9c759d5a15..0121dab83f4 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -846,7 +846,7 @@ static int software_resume(void) goto Finish; } -late_initcall(software_resume); +late_initcall_sync(software_resume); static const char * const hibernation_modes[] = { -- cgit v1.2.3-70-g09d2