From 1b614fb9a00e97b1eab54d4e442d405229c059dd Mon Sep 17 00:00:00 2001
From: Anton Vorontsov
Date: Wed, 8 Jul 2009 20:09:44 -0700
Subject: netpoll: Fix carrier detection for drivers that are using phylib

Using early netconsole and the gianfar driver, this error pops up:

  netconsole: timeout waiting for carrier

It appears that net/core/netpoll.c:netpoll_setup() is using
cond_resched() in a loop waiting for a carrier. The thing is that
cond_resched() is a no-op when system_state != SYSTEM_RUNNING, so
drivers/net/phy/phy.c's state_queue is never scheduled and link
detection doesn't work.

I believe that the main problem is in cond_resched()[1], but however
the cond_resched() story ends, it is a good idea to call msleep(1)
instead of cond_resched(), as suggested by Andrew Morton.

[1] http://lkml.org/lkml/2009/7/7/463

Signed-off-by: Anton Vorontsov
Signed-off-by: David S. Miller
---
 net/core/netpoll.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 9675f312830..df30feb2fc7 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -740,7 +740,7 @@ int netpoll_setup(struct netpoll *np)
 					       np->name);
 				break;
 			}
-			cond_resched();
+			msleep(1);
 		}
 
 		/* If carrier appears to come up instantly, we don't
--
cgit v1.2.3-18-g5258
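For context, the loop this one-liner changes is netpoll_setup()'s carrier
wait. A simplified paraphrase follows; the deadline bookkeeping is
illustrative, not the verbatim kernel source:

    /* Paraphrase of the carrier wait in netpoll_setup(); the deadline
     * value here is illustrative. */
    unsigned long deadline = jiffies + 4 * HZ;

    while (!netif_carrier_ok(ndev)) {
            if (time_after(jiffies, deadline)) {
                    printk(KERN_NOTICE "%s: timeout waiting for carrier\n",
                           np->name);
                    break;
            }
            /*
             * cond_resched() is a no-op while system_state !=
             * SYSTEM_RUNNING, so during early boot it never let
             * phylib's state machine run; msleep(1) sleeps
             * unconditionally and lets the link come up.
             */
            msleep(1);
    }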
From a57de0b4336e48db2811a2030bb68dba8dd09d88 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Wed, 8 Jul 2009 12:09:13 +0000
Subject: net: adding memory barrier to the poll and receive callbacks

Adding a memory barrier after the poll_wait function, paired with the
receive callbacks. Adding functions sock_poll_wait and sk_has_sleeper
to wrap the memory barrier.

Without the memory barrier, the following race can happen. The race
fires when the following code paths meet and the tp->rcv_nxt and
__add_wait_queue updates stay in CPU caches.

CPU1                         CPU2

sys_select                   receive packet
  ...                        ...
  __add_wait_queue           update tp->rcv_nxt
  ...                        ...
  tp->rcv_nxt check          sock_def_readable
  ...                        {
  schedule                     ...
                               if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                                 wake_up_interruptible(sk->sk_sleep)
                               ...
                             }

If there were no caches the code would work ok, since the wait_queue
and rcv_nxt accesses are opposite to each other. Meaning that once
tp->rcv_nxt is updated by CPU2, CPU1 has either already passed the
tp->rcv_nxt check and sleeps, or will get the new value for tp->rcv_nxt
and will return with a new data mask. In both cases the process (CPU1)
is being added to the wait queue, so the waitqueue_active (CPU2) call
cannot miss and will wake up CPU1.

The bad case is when the __add_wait_queue changes done by CPU1 stay in
its cache, and so does the tp->rcv_nxt update on the CPU2 side. CPU1
will then end up calling schedule and sleeping forever if there is no
more data on the socket.

Calls to poll_wait in the following modules were omitted:

	net/bluetooth/af_bluetooth.c
	net/irda/af_irda.c
	net/irda/irnet/irnet_ppp.c
	net/mac80211/rc80211_pid_debugfs.c
	net/phonet/socket.c
	net/rds/af_rds.c
	net/rfkill/core.c
	net/sunrpc/cache.c
	net/sunrpc/rpc_pipe.c
	net/tipc/socket.c

Signed-off-by: Jiri Olsa
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/datagram.c | 2 +-
 net/core/sock.c     | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index 58abee1f1df..b0fe69211ee 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -712,7 +712,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 	struct sock *sk = sock->sk;
 	unsigned int mask;
 
-	poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk->sk_sleep, wait);
 	mask = 0;
 
 	/* exceptional events? */
diff --git a/net/core/sock.c b/net/core/sock.c
index b0ba569bc97..6354863b1c6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1715,7 +1715,7 @@ EXPORT_SYMBOL(sock_no_sendpage);
 static void sock_def_wakeup(struct sock *sk)
 {
 	read_lock(&sk->sk_callback_lock);
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+	if (sk_has_sleeper(sk))
 		wake_up_interruptible_all(sk->sk_sleep);
 	read_unlock(&sk->sk_callback_lock);
 }
@@ -1723,7 +1723,7 @@ static void sock_def_wakeup(struct sock *sk)
 static void sock_def_error_report(struct sock *sk)
 {
 	read_lock(&sk->sk_callback_lock);
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+	if (sk_has_sleeper(sk))
 		wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
 	read_unlock(&sk->sk_callback_lock);
@@ -1732,7 +1732,7 @@ static void sock_def_error_report(struct sock *sk)
 static void sock_def_readable(struct sock *sk, int len)
 {
 	read_lock(&sk->sk_callback_lock);
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+	if (sk_has_sleeper(sk))
 		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
 						POLLRDNORM | POLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
@@ -1747,7 +1747,7 @@ static void sock_def_write_space(struct sock *sk)
 	 * progress.  --DaveM
 	 */
 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
-		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+		if (sk_has_sleeper(sk))
 			wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
--
cgit v1.2.3-18-g5258
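The net/core diff above only shows the call sites; the two wrappers
themselves land in include/net/sock.h, which is outside this
net/core-limited view. A sketch of them, reconstructed from the commit
description (treat it as indicative rather than the verbatim header
change):

    static inline int sk_has_sleeper(struct sock *sk)
    {
            /*
             * We need to be sure we are in sync with the
             * add_wait_queue modifications to the wait queue.
             *
             * This memory barrier is paired in sock_poll_wait.
             */
            smp_mb();
            return sk->sk_sleep && waitqueue_active(sk->sk_sleep);
    }

    static inline void sock_poll_wait(struct file *filp,
                                      wait_queue_head_t *wait_address,
                                      poll_table *p)
    {
            if (p && wait_address) {
                    poll_wait(filp, wait_address, p);
                    /*
                     * We need to be sure we are in sync with the
                     * socket flags modification.
                     *
                     * This memory barrier is paired in sk_has_sleeper.
                     */
                    smp_mb();
            }
    }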
From e912b1142be8f1e2c71c71001dc992c6e5eb2ec1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 8 Jul 2009 19:36:05 +0000
Subject: net: sk_prot_alloc() should not blindly overwrite memory

Some sockets use SLAB_DESTROY_BY_RCU, and the correctness of our RCU
code depends on sk->sk_nulls_node.next always being valid. A NULL value
is not allowed, as it might fault a lockless reader.

The current sk_prot_alloc() implementation doesn't respect this
invariant, calling kmem_cache_alloc() with __GFP_ZERO. Just call
memset() around the forbidden field instead.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/sock.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 6354863b1c6..ba5d2116aea 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -939,8 +939,23 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 	struct kmem_cache *slab;
 
 	slab = prot->slab;
-	if (slab != NULL)
-		sk = kmem_cache_alloc(slab, priority);
+	if (slab != NULL) {
+		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
+		if (!sk)
+			return sk;
+		if (priority & __GFP_ZERO) {
+			/*
+			 * caches using SLAB_DESTROY_BY_RCU should let
+			 * sk_node.next un-modified. Special care is taken
+			 * when initializing object to zero.
+			 */
+			if (offsetof(struct sock, sk_node.next) != 0)
+				memset(sk, 0, offsetof(struct sock, sk_node.next));
+			memset(&sk->sk_node.pprev, 0,
+			       prot->obj_size - offsetof(struct sock,
+							 sk_node.pprev));
+		}
+	}
 	else
 		sk = kmalloc(prot->obj_size, priority);
--
cgit v1.2.3-18-g5258

From 4dc6dc7162c08b9965163c9ab3f9375d4adff2c7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 15 Jul 2009 23:13:10 +0000
Subject: net: sock_copy() fixes

Commit e912b1142be8f1e2c71c71001dc992c6e5eb2ec1
(net: sk_prot_alloc() should not blindly overwrite memory)
took care of not zeroing the whole new socket at allocation time.

sock_copy() is another spot where we should be very careful. We should
not set refcnt to a non-null value until we are sure other fields are
correctly set up, or a lockless reader could catch this socket by
mistake while it is not fully (re)initialized.

This patch puts sk_node & sk_refcnt at the very beginning of struct
sock to ease the job of sock_copy() & sk_prot_alloc().

We add the appropriate smp_wmb() before the sk_refcnt initializations
to match our RCU requirements (changes to sock keys should be committed
to memory before the sk_refcnt setting).

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/sock.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index ba5d2116aea..d9eec153d53 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -919,13 +919,19 @@ static inline void sock_lock_init(struct sock *sk)
 			af_family_keys + sk->sk_family);
 }
 
+/*
+ * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
+ * even temporarly, because of RCU lookups. sk_node should also be left as is.
+ */
 static void sock_copy(struct sock *nsk, const struct sock *osk)
 {
 #ifdef CONFIG_SECURITY_NETWORK
 	void *sptr = nsk->sk_security;
 #endif
-
-	memcpy(nsk, osk, osk->sk_prot->obj_size);
+	BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
+		     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
+	memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
+	       osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
 #ifdef CONFIG_SECURITY_NETWORK
 	nsk->sk_security = sptr;
 	security_sk_clone(osk, nsk);
@@ -1140,6 +1146,11 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		newsk->sk_err	   = 0;
 		newsk->sk_priority = 0;
+		/*
+		 * Before updating sk_refcnt, we must commit prior changes to memory
+		 * (Documentation/RCU/rculist_nulls.txt for details)
+		 */
+		smp_wmb();
 		atomic_set(&newsk->sk_refcnt, 2);
 
 		/*
@@ -1855,6 +1866,11 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 	sk->sk_stamp = ktime_set(-1L, 0);
 
+	/*
+	 * Before updating sk_refcnt, we must commit prior changes to memory
+	 * (Documentation/RCU/rculist_nulls.txt for details)
+	 */
+	smp_wmb();
 	atomic_set(&sk->sk_refcnt, 1);
 	atomic_set(&sk->sk_wmem_alloc, 1);
 	atomic_set(&sk->sk_drops, 0);
--
cgit v1.2.3-18-g5258
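Taken together, these two commits enforce a single rule for
SLAB_DESTROY_BY_RCU sockets: never wipe the RCU lookup linkage, and
publish the reference count last. A condensed illustration of that rule
follows; the struct and function names (obj, obj_reinit) are made up
for the example, not kernel API:

    struct obj {
            struct hlist_nulls_node node;   /* must stay valid across reuse */
            atomic_t                refcnt; /* must be published last */
            int                     payload;/* from here on may be wiped */
    };

    static void obj_reinit(struct obj *o, int payload)
    {
            /* Zero around the linkage, never over it: a lockless reader
             * under rcu_read_lock() may be walking o->node.next right now. */
            memset((char *)o + offsetof(struct obj, refcnt), 0,
                   sizeof(*o) - offsetof(struct obj, refcnt));
            o->payload = payload;

            /* Commit all field stores to memory ... */
            smp_wmb();
            /* ... before a nonzero refcnt makes the object valid to readers. */
            atomic_set(&o->refcnt, 1);
    }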
From f249fb783092471a4808e5fc5bda071d2724810d Mon Sep 17 00:00:00 2001
From: Rémi Denis-Courmont
Date: Mon, 20 Jul 2009 00:47:04 +0000
Subject: Fix error return for setsockopt(SO_TIMESTAMPING)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I guess it should be -EINVAL rather than EINVAL. I have not checked
when the bug came in. Perhaps a candidate for -stable?

Signed-off-by: Rémi Denis-Courmont
Signed-off-by: David S. Miller
---
 net/core/sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index d9eec153d53..bbb25be7ddf 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -631,7 +631,7 @@ set_rcvbuf:
 
 	case SO_TIMESTAMPING:
 		if (val & ~SOF_TIMESTAMPING_MASK) {
-			ret = EINVAL;
+			ret = -EINVAL;
 			break;
 		}
 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
--
cgit v1.2.3-18-g5258

From 144586301f6af5ae5943a002f030d8c626fa4fdd Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 28 Jul 2009 02:36:15 +0000
Subject: net: net_assign_generic() fix

memcpy() should take into account the size of the pointers, not only
the number of pointers to copy.

Signed-off-by: Eric Dumazet
Acked-by: Pavel Emelyanov
Signed-off-by: David S. Miller
---
 net/core/net_namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b7292a2719d..197283072cc 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -488,7 +488,7 @@ int net_assign_generic(struct net *net, int id, void *data)
 	 */
 
 	ng->len = id;
-	memcpy(&ng->ptr, &old_ng->ptr, old_ng->len);
+	memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
 
 	rcu_assign_pointer(net->gen, ng);
 	call_rcu(&old_ng->rcu, net_generic_release);
--
cgit v1.2.3-18-g5258
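The bug fixed above is the classic element-count-versus-byte-count
confusion with memcpy(). A minimal standalone illustration (hypothetical
helper, not kernel code):

    #include <string.h>

    /* Copy an array of n pointers from src to dst. */
    static void copy_ptr_array(void **dst, void * const *src, size_t n)
    {
            /* Wrong: memcpy(dst, src, n); copies n *bytes*, i.e. only
             * n / sizeof(void *) pointers, silently dropping the rest. */
            memcpy(dst, src, n * sizeof(void *)); /* scale count by element size */
    }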
From a6ac65db2329e7685299666f5f7b6093c7b0f3a0 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Thu, 30 Jul 2009 01:06:12 +0000
Subject: net: restore the original spinlock to protect unicast list

There is a path where the assertion in dev_unicast_sync() fires:

igmp6_group_added -> dev_mc_add -> __dev_set_rx_mode ->
	-> vlan_dev_set_rx_mode -> dev_unicast_sync

Therefore we cannot protect this list with rtnl. This patch restores
the original protection of this list with a spinlock.

Signed-off-by: Jiri Pirko
Tested-by: Meelis Roos
Signed-off-by: David S. Miller
---
 net/core/dev.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 70c27e0c7c3..43e61ba7bd9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3865,10 +3865,12 @@ int dev_unicast_delete(struct net_device *dev, void *addr)
 
 	ASSERT_RTNL();
 
+	netif_addr_lock_bh(dev);
 	err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
 			    NETDEV_HW_ADDR_T_UNICAST);
 	if (!err)
 		__dev_set_rx_mode(dev);
+	netif_addr_unlock_bh(dev);
 	return err;
 }
 EXPORT_SYMBOL(dev_unicast_delete);
@@ -3889,10 +3891,12 @@ int dev_unicast_add(struct net_device *dev, void *addr)
 
 	ASSERT_RTNL();
 
+	netif_addr_lock_bh(dev);
 	err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
 			    NETDEV_HW_ADDR_T_UNICAST);
 	if (!err)
 		__dev_set_rx_mode(dev);
+	netif_addr_unlock_bh(dev);
 	return err;
 }
 EXPORT_SYMBOL(dev_unicast_add);
@@ -3949,7 +3953,8 @@ void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
  *	@from: source device
  *
  *	Add newly added addresses to the destination device and release
- *	addresses that have no users left.
+ *	addresses that have no users left. The source device must be
+ *	locked by netif_tx_lock_bh.
 *
 *	This function is intended to be called from the dev->set_rx_mode
 *	function of layered software devices.
@@ -3958,14 +3963,14 @@ int dev_unicast_sync(struct net_device *to, struct net_device *from)
 {
 	int err = 0;
 
-	ASSERT_RTNL();
-
 	if (to->addr_len != from->addr_len)
 		return -EINVAL;
 
+	netif_addr_lock_bh(to);
 	err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
 	if (!err)
 		__dev_set_rx_mode(to);
+	netif_addr_unlock_bh(to);
 	return err;
 }
 EXPORT_SYMBOL(dev_unicast_sync);
@@ -3981,28 +3986,30 @@ EXPORT_SYMBOL(dev_unicast_sync);
  */
 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
 {
-	ASSERT_RTNL();
-
 	if (to->addr_len != from->addr_len)
 		return;
 
+	netif_addr_lock_bh(from);
+	netif_addr_lock(to);
 	__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
 	__dev_set_rx_mode(to);
+	netif_addr_unlock(to);
+	netif_addr_unlock_bh(from);
 }
 EXPORT_SYMBOL(dev_unicast_unsync);
 
 static void dev_unicast_flush(struct net_device *dev)
 {
-	/* rtnl_mutex must be held here */
-
+	netif_addr_lock_bh(dev);
 	__hw_addr_flush(&dev->uc);
+	netif_addr_unlock_bh(dev);
 }
 
 static void dev_unicast_init(struct net_device *dev)
 {
-	/* rtnl_mutex must be held here */
-
+	netif_addr_lock_bh(dev);
 	__hw_addr_init(&dev->uc);
+	netif_addr_unlock_bh(dev);
 }
--
cgit v1.2.3-18-g5258
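The call chain in the commit message bottoms out in the VLAN driver's
rx-mode handler. A rough sketch of how it reaches dev_unicast_sync()
without holding the RTNL mutex (paraphrased from the net/8021q/vlan_dev.c
of this era, not verbatim):

    /* Sketch: a layered device's rx-mode handler propagating its address
     * lists to the underlying real device. __dev_set_rx_mode() can be
     * reached from dev_mc_add() in contexts that never took rtnl_lock(),
     * which is why an ASSERT_RTNL() inside dev_unicast_sync() fires. */
    static void vlan_dev_set_rx_mode(struct net_device *vlan_dev)
    {
            struct net_device *real_dev = vlan_dev_info(vlan_dev)->real_dev;

            dev_mc_sync(real_dev, vlan_dev);
            dev_unicast_sync(real_dev, vlan_dev);
    }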
From 0bf52b981770cbf006323bab5177f2858a196766 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 4 Aug 2009 21:16:58 +0000
Subject: net: Fix spinlock use in alloc_netdev_mq()

-tip testing found this lockdep warning:

[    2.272010] calling  net_dev_init+0x0/0x164 @ 1
[    2.276033] device class 'net': registering
[    2.280191] INFO: trying to register non-static key.
[    2.284005] the code is fine but needs lockdep annotation.
[    2.284005] turning off the locking correctness validator.
[    2.284005] Pid: 1, comm: swapper Not tainted 2.6.31-rc5-tip #1145
[    2.284005] Call Trace:
[    2.284005]  [<7958eb4e>] ? printk+0xf/0x11
[    2.284005]  [<7904f83c>] __lock_acquire+0x11b/0x622
[    2.284005]  [<7908c9b7>] ? alloc_debug_processing+0xf9/0x144
[    2.284005]  [<7904e2be>] ? mark_held_locks+0x3a/0x52
[    2.284005]  [<7908dbc4>] ? kmem_cache_alloc+0xa8/0x13f
[    2.284005]  [<7904e475>] ? trace_hardirqs_on_caller+0xa2/0xc3
[    2.284005]  [<7904fdf6>] lock_acquire+0xb3/0xd0
[    2.284005]  [<79489678>] ? alloc_netdev_mq+0xf5/0x1ad
[    2.284005]  [<79591514>] _spin_lock_bh+0x2d/0x5d
[    2.284005]  [<79489678>] ? alloc_netdev_mq+0xf5/0x1ad
[    2.284005]  [<79489678>] alloc_netdev_mq+0xf5/0x1ad
[    2.284005]  [<793a38f2>] ? loopback_setup+0x0/0x74
[    2.284005]  [<798eecd0>] loopback_net_init+0x20/0x5d
[    2.284005]  [<79483efb>] register_pernet_device+0x23/0x4b
[    2.284005]  [<798f5c9f>] net_dev_init+0x115/0x164
[    2.284005]  [<7900104f>] do_one_initcall+0x4a/0x11a
[    2.284005]  [<798f5b8a>] ? net_dev_init+0x0/0x164
[    2.284005]  [<79066f6d>] ? register_irq_proc+0x8c/0xa8
[    2.284005]  [<798cc29a>] do_basic_setup+0x42/0x52
[    2.284005]  [<798cc30a>] kernel_init+0x60/0xa1
[    2.284005]  [<798cc2aa>] ? kernel_init+0x0/0xa1
[    2.284005]  [<79003e03>] kernel_thread_helper+0x7/0x10
[    2.284078] device: 'lo': device_add
[    2.288248] initcall net_dev_init+0x0/0x164 returned 0 after 11718 usecs
[    2.292010] calling  neigh_init+0x0/0x66 @ 1
[    2.296010] initcall neigh_init+0x0/0x66 returned 0 after 0 usecs

It's using a zero-initialized spinlock. This is a side effect of:

	dev_unicast_init(dev);

in alloc_netdev_mq() making use of dev->addr_list_lock.

The device has just been freshly allocated and is not accessible
anywhere yet, so no locking is needed at all - in fact it's wrong to
lock it here (the lock isn't initialized yet).

This bug was introduced via:

| commit a6ac65db2329e7685299666f5f7b6093c7b0f3a0
| Date:   Thu Jul 30 01:06:12 2009 +0000
|
|     net: restore the original spinlock to protect unicast list

Signed-off-by: Ingo Molnar
Acked-by: Jiri Pirko
Tested-by: Mark Brown
Signed-off-by: David S. Miller
---
 net/core/dev.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 43e61ba7bd9..6a94475aee8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4007,9 +4007,7 @@ static void dev_unicast_flush(struct net_device *dev)
 
 static void dev_unicast_init(struct net_device *dev)
 {
-	netif_addr_lock_bh(dev);
 	__hw_addr_init(&dev->uc);
-	netif_addr_unlock_bh(dev);
 }
--
cgit v1.2.3-18-g5258
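The "INFO: trying to register non-static key." line in the trace above
is lockdep's way of saying a dynamically allocated lock was taken before
spin_lock_init() gave it a lock class. A minimal illustration of the
rule (illustrative code, not from the kernel tree):

    #include <linux/slab.h>
    #include <linux/spinlock.h>

    static void lock_init_order(void)
    {
            spinlock_t *lock = kzalloc(sizeof(*lock), GFP_KERNEL);

            if (!lock)
                    return;
            /*
             * Calling spin_lock(lock) at this point would reproduce the
             * warning: zeroed memory is not an initialized lock. A
             * freshly allocated object that nothing else can see needs
             * no locking anyway - which is why the patch simply drops
             * the lock/unlock pair around __hw_addr_init().
             */
            spin_lock_init(lock);   /* register the lockdep key first */
            spin_lock(lock);
            spin_unlock(lock);
            kfree(lock);
    }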