From 11a3122f6cf2d988a77eb8883d0fc49cd013a6d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Feb 2012 07:51:30 +0100 Subject: block: strip out locking optimization in put_io_context() put_io_context() performed a complex trylock dancing to avoid deferring ioc release to workqueue. It was also broken on UP because trylock was always assumed to succeed which resulted in unbalanced preemption count. While there are ways to fix the UP breakage, even the most pathological microbench (forced ioc allocation and tight fork/exit loop) fails to show any appreciable performance benefit of the optimization. Strip it out. If there turns out to be workloads which are affected by this change, simpler optimization from the discussion thread can be applied later. Signed-off-by: Tejun Heo LKML-Reference: <1328514611.21268.66.camel@sli10-conroe> Signed-off-by: Jens Axboe --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 051f090d40c..c574aefa8d1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -890,7 +890,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc, NULL); + put_io_context(new_ioc); } #endif return 0; -- cgit v1.2.3-70-g09d2 From f39d47ff819ed52a2afbdbecbe35f23f7755f58d Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 7 Feb 2012 14:39:57 +0100 Subject: perf: Fix double start/stop in x86_pmu_start() The following patch fixes a bug introduced by the following commit: e050e3f0a71b ("perf: Fix broken interrupt rate throttling") The patch caused the following warning to pop up depending on the sampling frequency adjustments: ------------[ cut here ]------------ WARNING: at arch/x86/kernel/cpu/perf_event.c:995 x86_pmu_start+0x79/0xd4() It was caused by the following call sequence: perf_adjust_freq_unthr_context.part() { stop() if (delta > 0) { perf_adjust_period() { if (period > 8*...) { stop() ... start() } } } start() } Which caused a double start and a double stop, thus triggering the assert in x86_pmu_start(). The patch fixes the problem by avoiding the double calls. We pass a new argument to perf_adjust_period() to indicate whether or not the event is already stopped. We can't just remove the start/stop from that function because it's called from __perf_event_overflow where the event needs to be reloaded via a stop/start back-toback call. The patch reintroduces the assertion in x86_pmu_start() which was removed by commit: 84f2b9b ("perf: Remove deprecated WARN_ON_ONCE()") In this second version, we've added calls to disable/enable PMU during unthrottling or frequency adjustment based on bug report of spurious NMI interrupts from Eric Dumazet. Reported-and-tested-by: Eric Dumazet Signed-off-by: Stephane Eranian Acked-by: Peter Zijlstra Cc: markus@trippelsdorf.de Cc: paulus@samba.org Link: http://lkml.kernel.org/r/20120207133956.GA4932@quad [ Minor edits to the changelog and to the code ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ kernel/events/core.c | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2a30e5ae6ac..5adce1040b1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -986,6 +986,9 @@ static void x86_pmu_start(struct perf_event *event, int flags) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx = event->hw.idx; + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + if (WARN_ON_ONCE(idx == -1)) return; diff --git a/kernel/events/core.c b/kernel/events/core.c index ba36013cfb2..1b5c081d8b9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2303,7 +2303,7 @@ do { \ static DEFINE_PER_CPU(int, perf_throttled_count); static DEFINE_PER_CPU(u64, perf_throttled_seq); -static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; @@ -2322,9 +2322,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { - event->pmu->stop(event, PERF_EF_UPDATE); + if (disable) + event->pmu->stop(event, PERF_EF_UPDATE); + local64_set(&hwc->period_left, 0); - event->pmu->start(event, PERF_EF_RELOAD); + + if (disable) + event->pmu->start(event, PERF_EF_RELOAD); } } @@ -2350,6 +2354,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, return; raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) @@ -2381,13 +2386,17 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, /* * restart the event * reload only if value has changed + * we have stopped the event so tell that + * to perf_adjust_period() to avoid stopping it + * twice. */ if (delta > 0) - perf_adjust_period(event, period, delta); + perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); } + perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -4562,7 +4571,7 @@ static int __perf_event_overflow(struct perf_event *event, hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) - perf_adjust_period(event, delta, hwc->last_period); + perf_adjust_period(event, delta, hwc->last_period, true); } /* -- cgit v1.2.3-70-g09d2 From f6302f1bcd75a042df69866d98b8d775a668f8f1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 10 Feb 2012 09:03:58 +0100 Subject: relay: prevent integer overflow in relay_open() "subbuf_size" and "n_subbufs" come from the user and they need to be capped to prevent an integer overflow. Signed-off-by: Dan Carpenter Cc: stable@kernel.org Signed-off-by: Jens Axboe --- kernel/relay.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 4335e1d7ee2..ab56a1764d4 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -164,10 +164,14 @@ depopulate: */ static struct rchan_buf *relay_create_buf(struct rchan *chan) { - struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); - if (!buf) + struct rchan_buf *buf; + + if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) return NULL; + buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); + if (!buf) + return NULL; buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); if (!buf->padding) goto free_buf; @@ -574,6 +578,8 @@ struct rchan *relay_open(const char *base_filename, if (!(subbuf_size && n_subbufs)) return NULL; + if (subbuf_size > UINT_MAX / n_subbufs) + return NULL; chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); if (!chan) -- cgit v1.2.3-70-g09d2 From 10f296cbfe3b93188c41463fd7a53808ebdbcbe3 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 1 Feb 2012 10:33:11 +0800 Subject: module: make module param bint handle nul value Allow bint param accept nul values, just do same as bool param. Signed-off-by: Dave Young Cc: Rusty Russell Signed-off-by: Rusty Russell --- kernel/params.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 32ee0430828..4bc965d8a1f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -97,7 +97,8 @@ static int parse_one(char *param, for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { /* No one handled NULL, so do it here. */ - if (!val && params[i].ops->set != param_set_bool) + if (!val && params[i].ops->set != param_set_bool + && params[i].ops->set != param_set_bint) return -EINVAL; pr_debug("They are equal! Calling %p\n", params[i].ops->set); -- cgit v1.2.3-70-g09d2 From 074b85175a43a23fdbde60f55feea636e0bf0f85 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 8 Feb 2012 12:39:07 -0800 Subject: vfs: fix panic in __d_lookup() with high dentry hashtable counts When the number of dentry cache hash table entries gets too high (2147483648 entries), as happens by default on a 16TB system, use of a signed integer in the dcache_init() initialization loop prevents the dentry_hashtable from getting initialized, causing a panic in __d_lookup(). Fix this in dcache_init() and similar areas. Signed-off-by: Dimitri Sivanich Acked-by: David S. Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++---- fs/inode.c | 8 ++++---- kernel/pid.c | 4 ++-- mm/page_alloc.c | 1 + net/ipv4/tcp.c | 5 +++-- 5 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/dcache.c b/fs/dcache.c index 16a53cc2cc0..fe19ac13f75 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2968,7 +2968,7 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -2986,13 +2986,13 @@ static void __init dcache_init_early(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - int loop; + unsigned int loop; /* * A constructor could be added for stable state like the lists, @@ -3016,7 +3016,7 @@ static void __init dcache_init(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } diff --git a/fs/inode.c b/fs/inode.c index fb10d86ffad..d3ebdbe723d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1651,7 +1651,7 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -1669,13 +1669,13 @@ void __init inode_init_early(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - int loop; + unsigned int loop; /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", @@ -1699,7 +1699,7 @@ void __init inode_init(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } diff --git a/kernel/pid.c b/kernel/pid.c index ce8e00deacc..9f08dfabaf1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -543,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - int i, pidhash_size; + unsigned int i, pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL, &pidhash_shift, NULL, 4096); - pidhash_size = 1 << pidhash_shift; + pidhash_size = 1U << pidhash_shift; for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d2186ecb36f..a13ded1938f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5236,6 +5236,7 @@ void *__init alloc_large_system_hash(const char *tablename, max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; do_div(max, bucketsize); } + max = min(max, 0x80000000ULL); if (numentries > max) numentries = max; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 37755ccc0e9..22ef5f9fd2f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3240,7 +3240,8 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long limit; - int i, max_share, cnt; + int max_share, cnt; + unsigned int i; unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3283,7 +3284,7 @@ void __init tcp_init(void) &tcp_hashinfo.bhash_size, NULL, 64 * 1024); - tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; + tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); -- cgit v1.2.3-70-g09d2 From d80e731ecab420ddcb79ee9d0ac427acbc187b4b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 24 Feb 2012 20:07:11 +0100 Subject: epoll: introduce POLLFREE to flush ->signalfd_wqh before kfree() This patch is intentionally incomplete to simplify the review. It ignores ep_unregister_pollwait() which plays with the same wqh. See the next change. epoll assumes that the EPOLL_CTL_ADD'ed file controls everything f_op->poll() needs. In particular it assumes that the wait queue can't go away until eventpoll_release(). This is not true in case of signalfd, the task which does EPOLL_CTL_ADD uses its ->sighand which is not connected to the file. This patch adds the special event, POLLFREE, currently only for epoll. It expects that init_poll_funcptr()'ed hook should do the necessary cleanup. Perhaps it should be defined as EPOLLFREE in eventpoll. __cleanup_sighand() is changed to do wake_up_poll(POLLFREE) if ->signalfd_wqh is not empty, we add the new signalfd_cleanup() helper. ep_poll_callback(POLLFREE) simply does list_del_init(task_list). This make this poll entry inconsistent, but we don't care. If you share epoll fd which contains our sigfd with another process you should blame yourself. signalfd is "really special". I simply do not know how we can define the "right" semantics if it used with epoll. The main problem is, epoll calls signalfd_poll() once to establish the connection with the wait queue, after that signalfd_poll(NULL) returns the different/inconsistent results depending on who does EPOLL_CTL_MOD/signalfd_read/etc. IOW: apart from sigmask, signalfd has nothing to do with the file, it works with the current thread. In short: this patch is the hack which tries to fix the symptoms. It also assumes that nobody can take tasklist_lock under epoll locks, this seems to be true. Note: - we do not have wake_up_all_poll() but wake_up_poll() is fine, poll/epoll doesn't use WQ_FLAG_EXCLUSIVE. - signalfd_cleanup() uses POLLHUP along with POLLFREE, we need a couple of simple changes in eventpoll.c to make sure it can't be "lost". Reported-by: Maxime Bizon Cc: Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++++ fs/signalfd.c | 11 +++++++++++ include/asm-generic/poll.h | 2 ++ include/linux/signalfd.h | 5 ++++- kernel/fork.c | 5 ++++- 5 files changed, 25 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aabdfc38cf2..34bbfc6dd8d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -842,6 +842,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + /* the caller holds eppoll_entry->whead->lock */ + if ((unsigned long)key & POLLFREE) + list_del_init(&wait->task_list); + spin_lock_irqsave(&ep->lock, flags); /* diff --git a/fs/signalfd.c b/fs/signalfd.c index 492465b451d..79c1eea98a3 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -30,6 +30,17 @@ #include #include +void signalfd_cleanup(struct sighand_struct *sighand) +{ + wait_queue_head_t *wqh = &sighand->signalfd_wqh; + + if (likely(!waitqueue_active(wqh))) + return; + + /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + wake_up_poll(wqh, POLLHUP | POLLFREE); +} + struct signalfd_ctx { sigset_t sigmask; }; diff --git a/include/asm-generic/poll.h b/include/asm-generic/poll.h index 44bce836d35..9ce7f44aebd 100644 --- a/include/asm-generic/poll.h +++ b/include/asm-generic/poll.h @@ -28,6 +28,8 @@ #define POLLRDHUP 0x2000 #endif +#define POLLFREE 0x4000 /* currently only for epoll */ + struct pollfd { int fd; short events; diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h index 3ff4961da9b..247399b2979 100644 --- a/include/linux/signalfd.h +++ b/include/linux/signalfd.h @@ -61,13 +61,16 @@ static inline void signalfd_notify(struct task_struct *tsk, int sig) wake_up(&tsk->sighand->signalfd_wqh); } +extern void signalfd_cleanup(struct sighand_struct *sighand); + #else /* CONFIG_SIGNALFD */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { } +static inline void signalfd_cleanup(struct sighand_struct *sighand) { } + #endif /* CONFIG_SIGNALFD */ #endif /* __KERNEL__ */ #endif /* _LINUX_SIGNALFD_H */ - diff --git a/kernel/fork.c b/kernel/fork.c index b77fd559c78..e2cd3e2a5ae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -935,8 +936,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_sighand(struct sighand_struct *sighand) { - if (atomic_dec_and_test(&sighand->count)) + if (atomic_dec_and_test(&sighand->count)) { + signalfd_cleanup(sighand); kmem_cache_free(sighand_cachep, sighand); + } } -- cgit v1.2.3-70-g09d2