From 11a3122f6cf2d988a77eb8883d0fc49cd013a6d5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 7 Feb 2012 07:51:30 +0100
Subject: block: strip out locking optimization in put_io_context()

put_io_context() performed a complex trylock dancing to avoid
deferring ioc release to workqueue.  It was also broken on UP because
trylock was always assumed to succeed which resulted in unbalanced
preemption count.

While there are ways to fix the UP breakage, even the most
pathological microbench (forced ioc allocation and tight fork/exit
loop) fails to show any appreciable performance benefit of the
optimization.  Strip it out.  If there turns out to be workloads which
are affected by this change, simpler optimization from the discussion
thread can be applied later.

Signed-off-by: Tejun Heo <tj@kernel.org>
LKML-Reference: <1328514611.21268.66.camel@sli10-conroe>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 051f090d40c..c574aefa8d1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -890,7 +890,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 			return -ENOMEM;
 
 		new_ioc->ioprio = ioc->ioprio;
-		put_io_context(new_ioc, NULL);
+		put_io_context(new_ioc);
 	}
 #endif
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From f39d47ff819ed52a2afbdbecbe35f23f7755f58d Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Tue, 7 Feb 2012 14:39:57 +0100
Subject: perf: Fix double start/stop in x86_pmu_start()

The following patch fixes a bug introduced by the following
commit:

        e050e3f0a71b ("perf: Fix broken interrupt rate throttling")

The patch caused the following warning to pop up depending on
the sampling frequency adjustments:

  ------------[ cut here ]------------
  WARNING: at arch/x86/kernel/cpu/perf_event.c:995 x86_pmu_start+0x79/0xd4()

It was caused by the following call sequence:

perf_adjust_freq_unthr_context.part() {
     stop()
     if (delta > 0) {
          perf_adjust_period() {
              if (period > 8*...) {
                  stop()
                  ...
                  start()
              }
          }
      }
      start()
}

Which caused a double start and a double stop, thus triggering
the assert in x86_pmu_start().

The patch fixes the problem by avoiding the double calls. We
pass a new argument to perf_adjust_period() to indicate whether
or not the event is already stopped. We can't just remove the
start/stop from that function because it's called from
__perf_event_overflow where the event needs to be reloaded via a
stop/start back-toback call.

The patch reintroduces the assertion in x86_pmu_start() which
was removed by commit:

	84f2b9b ("perf: Remove deprecated WARN_ON_ONCE()")

In this second version, we've added calls to disable/enable PMU
during unthrottling or frequency adjustment based on bug report
of spurious NMI interrupts from Eric Dumazet.

Reported-and-tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: markus@trippelsdorf.de
Cc: paulus@samba.org
Link: http://lkml.kernel.org/r/20120207133956.GA4932@quad
[ Minor edits to the changelog and to the code ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c |  3 +++
 kernel/events/core.c             | 19 ++++++++++++++-----
 2 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2a30e5ae6ac..5adce1040b1 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -986,6 +986,9 @@ static void x86_pmu_start(struct perf_event *event, int flags)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx = event->hw.idx;
 
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+		return;
+
 	if (WARN_ON_ONCE(idx == -1))
 		return;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ba36013cfb2..1b5c081d8b9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2303,7 +2303,7 @@ do {					\
 static DEFINE_PER_CPU(int, perf_throttled_count);
 static DEFINE_PER_CPU(u64, perf_throttled_seq);
 
-static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
+static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	s64 period, sample_period;
@@ -2322,9 +2322,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 	hwc->sample_period = sample_period;
 
 	if (local64_read(&hwc->period_left) > 8*sample_period) {
-		event->pmu->stop(event, PERF_EF_UPDATE);
+		if (disable)
+			event->pmu->stop(event, PERF_EF_UPDATE);
+
 		local64_set(&hwc->period_left, 0);
-		event->pmu->start(event, PERF_EF_RELOAD);
+
+		if (disable)
+			event->pmu->start(event, PERF_EF_RELOAD);
 	}
 }
 
@@ -2350,6 +2354,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 		return;
 
 	raw_spin_lock(&ctx->lock);
+	perf_pmu_disable(ctx->pmu);
 
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -2381,13 +2386,17 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 		/*
 		 * restart the event
 		 * reload only if value has changed
+		 * we have stopped the event so tell that
+		 * to perf_adjust_period() to avoid stopping it
+		 * twice.
 		 */
 		if (delta > 0)
-			perf_adjust_period(event, period, delta);
+			perf_adjust_period(event, period, delta, false);
 
 		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
 	}
 
+	perf_pmu_enable(ctx->pmu);
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -4562,7 +4571,7 @@ static int __perf_event_overflow(struct perf_event *event,
 		hwc->freq_time_stamp = now;
 
 		if (delta > 0 && delta < 2*TICK_NSEC)
-			perf_adjust_period(event, delta, hwc->last_period);
+			perf_adjust_period(event, delta, hwc->last_period, true);
 	}
 
 	/*
-- 
cgit v1.2.3-70-g09d2


From f6302f1bcd75a042df69866d98b8d775a668f8f1 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 10 Feb 2012 09:03:58 +0100
Subject: relay: prevent integer overflow in relay_open()

"subbuf_size" and "n_subbufs" come from the user and they need to be
capped to prevent an integer overflow.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/relay.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 4335e1d7ee2..ab56a1764d4 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -164,10 +164,14 @@ depopulate:
  */
 static struct rchan_buf *relay_create_buf(struct rchan *chan)
 {
-	struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
-	if (!buf)
+	struct rchan_buf *buf;
+
+	if (chan->n_subbufs > UINT_MAX / sizeof(size_t *))
 		return NULL;
 
+	buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
+	if (!buf)
+		return NULL;
 	buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
 	if (!buf->padding)
 		goto free_buf;
@@ -574,6 +578,8 @@ struct rchan *relay_open(const char *base_filename,
 
 	if (!(subbuf_size && n_subbufs))
 		return NULL;
+	if (subbuf_size > UINT_MAX / n_subbufs)
+		return NULL;
 
 	chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
 	if (!chan)
-- 
cgit v1.2.3-70-g09d2


From 10f296cbfe3b93188c41463fd7a53808ebdbcbe3 Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Wed, 1 Feb 2012 10:33:11 +0800
Subject: module: make module param bint handle nul value

Allow bint param accept nul values, just do same as bool param.

Signed-off-by: Dave Young <dyoung@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/params.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 32ee0430828..4bc965d8a1f 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -97,7 +97,8 @@ static int parse_one(char *param,
 	for (i = 0; i < num_params; i++) {
 		if (parameq(param, params[i].name)) {
 			/* No one handled NULL, so do it here. */
-			if (!val && params[i].ops->set != param_set_bool)
+			if (!val && params[i].ops->set != param_set_bool
+			    && params[i].ops->set != param_set_bint)
 				return -EINVAL;
 			pr_debug("They are equal!  Calling %p\n",
 			       params[i].ops->set);
-- 
cgit v1.2.3-70-g09d2


From 074b85175a43a23fdbde60f55feea636e0bf0f85 Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Wed, 8 Feb 2012 12:39:07 -0800
Subject: vfs: fix panic in __d_lookup() with high dentry hashtable counts

When the number of dentry cache hash table entries gets too high
(2147483648 entries), as happens by default on a 16TB system, use of a
signed integer in the dcache_init() initialization loop prevents the
dentry_hashtable from getting initialized, causing a panic in
__d_lookup().  Fix this in dcache_init() and similar areas.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c     | 8 ++++----
 fs/inode.c      | 8 ++++----
 kernel/pid.c    | 4 ++--
 mm/page_alloc.c | 1 +
 net/ipv4/tcp.c  | 5 +++--
 5 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/fs/dcache.c b/fs/dcache.c
index 16a53cc2cc0..fe19ac13f75 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2968,7 +2968,7 @@ __setup("dhash_entries=", set_dhash_entries);
 
 static void __init dcache_init_early(void)
 {
-	int loop;
+	unsigned int loop;
 
 	/* If hashes are distributed across NUMA nodes, defer
 	 * hash allocation until vmalloc space is available.
@@ -2986,13 +2986,13 @@ static void __init dcache_init_early(void)
 					&d_hash_mask,
 					0);
 
-	for (loop = 0; loop < (1 << d_hash_shift); loop++)
+	for (loop = 0; loop < (1U << d_hash_shift); loop++)
 		INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
 }
 
 static void __init dcache_init(void)
 {
-	int loop;
+	unsigned int loop;
 
 	/* 
 	 * A constructor could be added for stable state like the lists,
@@ -3016,7 +3016,7 @@ static void __init dcache_init(void)
 					&d_hash_mask,
 					0);
 
-	for (loop = 0; loop < (1 << d_hash_shift); loop++)
+	for (loop = 0; loop < (1U << d_hash_shift); loop++)
 		INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index fb10d86ffad..d3ebdbe723d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1651,7 +1651,7 @@ __setup("ihash_entries=", set_ihash_entries);
  */
 void __init inode_init_early(void)
 {
-	int loop;
+	unsigned int loop;
 
 	/* If hashes are distributed across NUMA nodes, defer
 	 * hash allocation until vmalloc space is available.
@@ -1669,13 +1669,13 @@ void __init inode_init_early(void)
 					&i_hash_mask,
 					0);
 
-	for (loop = 0; loop < (1 << i_hash_shift); loop++)
+	for (loop = 0; loop < (1U << i_hash_shift); loop++)
 		INIT_HLIST_HEAD(&inode_hashtable[loop]);
 }
 
 void __init inode_init(void)
 {
-	int loop;
+	unsigned int loop;
 
 	/* inode slab cache */
 	inode_cachep = kmem_cache_create("inode_cache",
@@ -1699,7 +1699,7 @@ void __init inode_init(void)
 					&i_hash_mask,
 					0);
 
-	for (loop = 0; loop < (1 << i_hash_shift); loop++)
+	for (loop = 0; loop < (1U << i_hash_shift); loop++)
 		INIT_HLIST_HEAD(&inode_hashtable[loop]);
 }
 
diff --git a/kernel/pid.c b/kernel/pid.c
index ce8e00deacc..9f08dfabaf1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -543,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
  */
 void __init pidhash_init(void)
 {
-	int i, pidhash_size;
+	unsigned int i, pidhash_size;
 
 	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
 					   HASH_EARLY | HASH_SMALL,
 					   &pidhash_shift, NULL, 4096);
-	pidhash_size = 1 << pidhash_shift;
+	pidhash_size = 1U << pidhash_shift;
 
 	for (i = 0; i < pidhash_size; i++)
 		INIT_HLIST_HEAD(&pid_hash[i]);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d2186ecb36f..a13ded1938f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5236,6 +5236,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
 		do_div(max, bucketsize);
 	}
+	max = min(max, 0x80000000ULL);
 
 	if (numentries > max)
 		numentries = max;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 37755ccc0e9..22ef5f9fd2f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3240,7 +3240,8 @@ void __init tcp_init(void)
 {
 	struct sk_buff *skb = NULL;
 	unsigned long limit;
-	int i, max_share, cnt;
+	int max_share, cnt;
+	unsigned int i;
 	unsigned long jiffy = jiffies;
 
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3283,7 +3284,7 @@ void __init tcp_init(void)
 					&tcp_hashinfo.bhash_size,
 					NULL,
 					64 * 1024);
-	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
+	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
 	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
 		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
-- 
cgit v1.2.3-70-g09d2


From d80e731ecab420ddcb79ee9d0ac427acbc187b4b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 24 Feb 2012 20:07:11 +0100
Subject: epoll: introduce POLLFREE to flush ->signalfd_wqh before kfree()

This patch is intentionally incomplete to simplify the review.
It ignores ep_unregister_pollwait() which plays with the same wqh.
See the next change.

epoll assumes that the EPOLL_CTL_ADD'ed file controls everything
f_op->poll() needs. In particular it assumes that the wait queue
can't go away until eventpoll_release(). This is not true in case
of signalfd, the task which does EPOLL_CTL_ADD uses its ->sighand
which is not connected to the file.

This patch adds the special event, POLLFREE, currently only for
epoll. It expects that init_poll_funcptr()'ed hook should do the
necessary cleanup. Perhaps it should be defined as EPOLLFREE in
eventpoll.

__cleanup_sighand() is changed to do wake_up_poll(POLLFREE) if
->signalfd_wqh is not empty, we add the new signalfd_cleanup()
helper.

ep_poll_callback(POLLFREE) simply does list_del_init(task_list).
This make this poll entry inconsistent, but we don't care. If you
share epoll fd which contains our sigfd with another process you
should blame yourself. signalfd is "really special". I simply do
not know how we can define the "right" semantics if it used with
epoll.

The main problem is, epoll calls signalfd_poll() once to establish
the connection with the wait queue, after that signalfd_poll(NULL)
returns the different/inconsistent results depending on who does
EPOLL_CTL_MOD/signalfd_read/etc. IOW: apart from sigmask, signalfd
has nothing to do with the file, it works with the current thread.

In short: this patch is the hack which tries to fix the symptoms.
It also assumes that nobody can take tasklist_lock under epoll
locks, this seems to be true.

Note:

	- we do not have wake_up_all_poll() but wake_up_poll()
	  is fine, poll/epoll doesn't use WQ_FLAG_EXCLUSIVE.

	- signalfd_cleanup() uses POLLHUP along with POLLFREE,
	  we need a couple of simple changes in eventpoll.c to
	  make sure it can't be "lost".

Reported-by: Maxime Bizon <mbizon@freebox.fr>
Cc: <stable@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c             |  4 ++++
 fs/signalfd.c              | 11 +++++++++++
 include/asm-generic/poll.h |  2 ++
 include/linux/signalfd.h   |  5 ++++-
 kernel/fork.c              |  5 ++++-
 5 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index aabdfc38cf2..34bbfc6dd8d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -842,6 +842,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	struct epitem *epi = ep_item_from_wait(wait);
 	struct eventpoll *ep = epi->ep;
 
+	/* the caller holds eppoll_entry->whead->lock */
+	if ((unsigned long)key & POLLFREE)
+		list_del_init(&wait->task_list);
+
 	spin_lock_irqsave(&ep->lock, flags);
 
 	/*
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 492465b451d..79c1eea98a3 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -30,6 +30,17 @@
 #include <linux/signalfd.h>
 #include <linux/syscalls.h>
 
+void signalfd_cleanup(struct sighand_struct *sighand)
+{
+	wait_queue_head_t *wqh = &sighand->signalfd_wqh;
+
+	if (likely(!waitqueue_active(wqh)))
+		return;
+
+	/* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */
+	wake_up_poll(wqh, POLLHUP | POLLFREE);
+}
+
 struct signalfd_ctx {
 	sigset_t sigmask;
 };
diff --git a/include/asm-generic/poll.h b/include/asm-generic/poll.h
index 44bce836d35..9ce7f44aebd 100644
--- a/include/asm-generic/poll.h
+++ b/include/asm-generic/poll.h
@@ -28,6 +28,8 @@
 #define POLLRDHUP       0x2000
 #endif
 
+#define POLLFREE	0x4000	/* currently only for epoll */
+
 struct pollfd {
 	int fd;
 	short events;
diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h
index 3ff4961da9b..247399b2979 100644
--- a/include/linux/signalfd.h
+++ b/include/linux/signalfd.h
@@ -61,13 +61,16 @@ static inline void signalfd_notify(struct task_struct *tsk, int sig)
 		wake_up(&tsk->sighand->signalfd_wqh);
 }
 
+extern void signalfd_cleanup(struct sighand_struct *sighand);
+
 #else /* CONFIG_SIGNALFD */
 
 static inline void signalfd_notify(struct task_struct *tsk, int sig) { }
 
+static inline void signalfd_cleanup(struct sighand_struct *sighand) { }
+
 #endif /* CONFIG_SIGNALFD */
 
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_SIGNALFD_H */
-
diff --git a/kernel/fork.c b/kernel/fork.c
index b77fd559c78..e2cd3e2a5ae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
 #include <linux/user-return-notifier.h>
 #include <linux/oom.h>
 #include <linux/khugepaged.h>
+#include <linux/signalfd.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -935,8 +936,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
 
 void __cleanup_sighand(struct sighand_struct *sighand)
 {
-	if (atomic_dec_and_test(&sighand->count))
+	if (atomic_dec_and_test(&sighand->count)) {
+		signalfd_cleanup(sighand);
 		kmem_cache_free(sighand_cachep, sighand);
+	}
 }
 
 
-- 
cgit v1.2.3-70-g09d2