From 1b614fb9a00e97b1eab54d4e442d405229c059dd Mon Sep 17 00:00:00 2001
From: Anton Vorontsov
Date: Wed, 8 Jul 2009 20:09:44 -0700
Subject: netpoll: Fix carrier detection for drivers that are using phylib

Using early netconsole and the gianfar driver, this error pops up:

  netconsole: timeout waiting for carrier

It appears that net/core/netpoll.c:netpoll_setup() is using
cond_resched() in a loop waiting for a carrier. The thing is that
cond_resched() is a no-op when system_state != SYSTEM_RUNNING, so
drivers/net/phy/phy.c's state_queue is never scheduled and link
detection doesn't work.

I believe that the main problem is in cond_resched()[1], but however
the cond_resched() story ends, it is a good idea to call msleep(1)
instead of cond_resched(), as suggested by Andrew Morton.

[1] http://lkml.org/lkml/2009/7/7/463

Signed-off-by: Anton Vorontsov
Signed-off-by: David S. Miller
---
 net/core/netpoll.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 9675f312830..df30feb2fc7 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -740,7 +740,7 @@ int netpoll_setup(struct netpoll *np)
 					       np->name);
 				break;
 			}
-			cond_resched();
+			msleep(1);
 		}
 
 		/* If carrier appears to come up instantly, we don't
--
cgit v1.2.3-18-g5258
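For context, the loop this one-liner changes is netpoll_setup()'s carrier
wait. A simplified paraphrase follows; the deadline bookkeeping is
illustrative, not the verbatim kernel source:

    /* Paraphrase of the carrier wait in netpoll_setup(); the deadline
     * value here is illustrative. */
    unsigned long deadline = jiffies + 4 * HZ;

    while (!netif_carrier_ok(ndev)) {
            if (time_after(jiffies, deadline)) {
                    printk(KERN_NOTICE "%s: timeout waiting for carrier\n",
                           np->name);
                    break;
            }
            /*
             * cond_resched() is a no-op while system_state !=
             * SYSTEM_RUNNING, so during early boot it never let
             * phylib's state machine run; msleep(1) sleeps
             * unconditionally and lets the link come up.
             */
            msleep(1);
    }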
From a57de0b4336e48db2811a2030bb68dba8dd09d88 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Wed, 8 Jul 2009 12:09:13 +0000
Subject: net: adding memory barrier to the poll and receive callbacks

Adding a memory barrier after the poll_wait function, paired with the
receive callbacks. Adding functions sock_poll_wait and sk_has_sleeper
to wrap the memory barrier.

Without the memory barrier, the following race can happen. The race
fires when the following code paths meet and the tp->rcv_nxt and
__add_wait_queue updates stay in CPU caches.

CPU1                         CPU2

sys_select                   receive packet
  ...                        ...
  __add_wait_queue           update tp->rcv_nxt
  ...                        ...
  tp->rcv_nxt check          sock_def_readable
  ...                        {
  schedule                     ...
                               if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                                 wake_up_interruptible(sk->sk_sleep)
                               ...
                             }

If there were no caches the code would work ok, since the wait_queue
and rcv_nxt accesses are opposite to each other. Meaning that once
tp->rcv_nxt is updated by CPU2, CPU1 has either already passed the
tp->rcv_nxt check and sleeps, or will get the new value for tp->rcv_nxt
and will return with a new data mask. In both cases the process (CPU1)
is being added to the wait queue, so the waitqueue_active (CPU2) call
cannot miss and will wake up CPU1.

The bad case is when the __add_wait_queue changes done by CPU1 stay in
its cache, and so does the tp->rcv_nxt update on the CPU2 side. CPU1
will then end up calling schedule and sleeping forever if there is no
more data on the socket.

Calls to poll_wait in the following modules were omitted:

	net/bluetooth/af_bluetooth.c
	net/irda/af_irda.c
	net/irda/irnet/irnet_ppp.c
	net/mac80211/rc80211_pid_debugfs.c
	net/phonet/socket.c
	net/rds/af_rds.c
	net/rfkill/core.c
	net/sunrpc/cache.c
	net/sunrpc/rpc_pipe.c
	net/tipc/socket.c

Signed-off-by: Jiri Olsa
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/datagram.c | 2 +-
 net/core/sock.c     | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index 58abee1f1df..b0fe69211ee 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -712,7 +712,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 	struct sock *sk = sock->sk;
 	unsigned int mask;
 
-	poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk->sk_sleep, wait);
 	mask = 0;
 
 	/* exceptional events? */
diff --git a/net/core/sock.c b/net/core/sock.c
index b0ba569bc97..6354863b1c6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1715,7 +1715,7 @@ EXPORT_SYMBOL(sock_no_sendpage);
 static void sock_def_wakeup(struct sock *sk)
 {
 	read_lock(&sk->sk_callback_lock);
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+	if (sk_has_sleeper(sk))
 		wake_up_interruptible_all(sk->sk_sleep);
 	read_unlock(&sk->sk_callback_lock);
 }
@@ -1723,7 +1723,7 @@ static void sock_def_wakeup(struct sock *sk)
 static void sock_def_error_report(struct sock *sk)
 {
 	read_lock(&sk->sk_callback_lock);
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+	if (sk_has_sleeper(sk))
 		wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
 	read_unlock(&sk->sk_callback_lock);
@@ -1732,7 +1732,7 @@ static void sock_def_error_report(struct sock *sk)
 static void sock_def_readable(struct sock *sk, int len)
 {
 	read_lock(&sk->sk_callback_lock);
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+	if (sk_has_sleeper(sk))
 		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
 						POLLRDNORM | POLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
@@ -1747,7 +1747,7 @@ static void sock_def_write_space(struct sock *sk)
 	 * progress.  --DaveM
 	 */
 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
-		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+		if (sk_has_sleeper(sk))
 			wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
--
cgit v1.2.3-18-g5258
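The net/core diff above only shows the call sites; the two wrappers
themselves land in include/net/sock.h, which is outside this
net/core-limited view. A sketch of them, reconstructed from the commit
description (treat it as indicative rather than the verbatim header
change):

    static inline int sk_has_sleeper(struct sock *sk)
    {
            /*
             * We need to be sure we are in sync with the
             * add_wait_queue modifications to the wait queue.
             *
             * This memory barrier is paired in sock_poll_wait.
             */
            smp_mb();
            return sk->sk_sleep && waitqueue_active(sk->sk_sleep);
    }

    static inline void sock_poll_wait(struct file *filp,
                                      wait_queue_head_t *wait_address,
                                      poll_table *p)
    {
            if (p && wait_address) {
                    poll_wait(filp, wait_address, p);
                    /*
                     * We need to be sure we are in sync with the
                     * socket flags modification.
                     *
                     * This memory barrier is paired in sk_has_sleeper.
                     */
                    smp_mb();
            }
    }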
From e912b1142be8f1e2c71c71001dc992c6e5eb2ec1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 8 Jul 2009 19:36:05 +0000
Subject: net: sk_prot_alloc() should not blindly overwrite memory

Some sockets use SLAB_DESTROY_BY_RCU, and the correctness of our RCU
code depends on sk->sk_nulls_node.next always being valid. A NULL value
is not allowed, as it might fault a lockless reader.

The current sk_prot_alloc() implementation doesn't respect this
invariant, calling kmem_cache_alloc() with __GFP_ZERO. Just call
memset() around the forbidden field instead.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/sock.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 6354863b1c6..ba5d2116aea 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -939,8 +939,23 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 	struct kmem_cache *slab;
 
 	slab = prot->slab;
-	if (slab != NULL)
-		sk = kmem_cache_alloc(slab, priority);
+	if (slab != NULL) {
+		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
+		if (!sk)
+			return sk;
+		if (priority & __GFP_ZERO) {
+			/*
+			 * caches using SLAB_DESTROY_BY_RCU should let
+			 * sk_node.next un-modified. Special care is taken
+			 * when initializing object to zero.
+			 */
+			if (offsetof(struct sock, sk_node.next) != 0)
+				memset(sk, 0, offsetof(struct sock, sk_node.next));
+			memset(&sk->sk_node.pprev, 0,
+			       prot->obj_size - offsetof(struct sock,
+							 sk_node.pprev));
+		}
+	}
 	else
 		sk = kmalloc(prot->obj_size, priority);
--
cgit v1.2.3-18-g5258

From 4dc6dc7162c08b9965163c9ab3f9375d4adff2c7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 15 Jul 2009 23:13:10 +0000
Subject: net: sock_copy() fixes

Commit e912b1142be8f1e2c71c71001dc992c6e5eb2ec1
(net: sk_prot_alloc() should not blindly overwrite memory)
took care of not zeroing the whole new socket at allocation time.

sock_copy() is another spot where we should be very careful. We should
not set refcnt to a non-null value until we are sure other fields are
correctly set up, or a lockless reader could catch this socket by
mistake while it is not fully (re)initialized.

This patch puts sk_node & sk_refcnt at the very beginning of struct
sock to ease the job of sock_copy() & sk_prot_alloc().

We add the appropriate smp_wmb() before the sk_refcnt initializations
to match our RCU requirements (changes to sock keys should be committed
to memory before the sk_refcnt setting).

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/sock.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index ba5d2116aea..d9eec153d53 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -919,13 +919,19 @@ static inline void sock_lock_init(struct sock *sk)
 			af_family_keys + sk->sk_family);
 }
 
+/*
+ * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
+ * even temporarly, because of RCU lookups. sk_node should also be left as is.
+ */
 static void sock_copy(struct sock *nsk, const struct sock *osk)
 {
 #ifdef CONFIG_SECURITY_NETWORK
 	void *sptr = nsk->sk_security;
 #endif
-
-	memcpy(nsk, osk, osk->sk_prot->obj_size);
+	BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
+		     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
+	memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
+	       osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
 #ifdef CONFIG_SECURITY_NETWORK
 	nsk->sk_security = sptr;
 	security_sk_clone(osk, nsk);
@@ -1140,6 +1146,11 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		newsk->sk_err	   = 0;
 		newsk->sk_priority = 0;
+		/*
+		 * Before updating sk_refcnt, we must commit prior changes to memory
+		 * (Documentation/RCU/rculist_nulls.txt for details)
+		 */
+		smp_wmb();
 		atomic_set(&newsk->sk_refcnt, 2);
 
 		/*
@@ -1855,6 +1866,11 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 	sk->sk_stamp = ktime_set(-1L, 0);
 
+	/*
+	 * Before updating sk_refcnt, we must commit prior changes to memory
+	 * (Documentation/RCU/rculist_nulls.txt for details)
+	 */
+	smp_wmb();
 	atomic_set(&sk->sk_refcnt, 1);
 	atomic_set(&sk->sk_wmem_alloc, 1);
 	atomic_set(&sk->sk_drops, 0);
--
cgit v1.2.3-18-g5258
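Taken together, these two commits enforce a single rule for
SLAB_DESTROY_BY_RCU sockets: never wipe the RCU lookup linkage, and
publish the reference count last. A condensed illustration of that rule
follows; the struct and function names (obj, obj_reinit) are made up
for the example, not kernel API:

    struct obj {
            struct hlist_nulls_node node;   /* must stay valid across reuse */
            atomic_t                refcnt; /* must be published last */
            int                     payload;/* from here on may be wiped */
    };

    static void obj_reinit(struct obj *o, int payload)
    {
            /* Zero around the linkage, never over it: a lockless reader
             * under rcu_read_lock() may be walking o->node.next right now. */
            memset((char *)o + offsetof(struct obj, refcnt), 0,
                   sizeof(*o) - offsetof(struct obj, refcnt));
            o->payload = payload;

            /* Commit all field stores to memory ... */
            smp_wmb();
            /* ... before a nonzero refcnt makes the object valid to readers. */
            atomic_set(&o->refcnt, 1);
    }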
From f249fb783092471a4808e5fc5bda071d2724810d Mon Sep 17 00:00:00 2001
From: Rémi Denis-Courmont
Date: Mon, 20 Jul 2009 00:47:04 +0000
Subject: Fix error return for setsockopt(SO_TIMESTAMPING)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I guess it should be -EINVAL rather than EINVAL. I have not checked
when the bug came in. Perhaps a candidate for -stable?

Signed-off-by: Rémi Denis-Courmont
Signed-off-by: David S. Miller
---
 net/core/sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index d9eec153d53..bbb25be7ddf 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -631,7 +631,7 @@ set_rcvbuf:
 
 	case SO_TIMESTAMPING:
 		if (val & ~SOF_TIMESTAMPING_MASK) {
-			ret = EINVAL;
+			ret = -EINVAL;
 			break;
 		}
 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
--
cgit v1.2.3-18-g5258

From 144586301f6af5ae5943a002f030d8c626fa4fdd Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 28 Jul 2009 02:36:15 +0000
Subject: net: net_assign_generic() fix

memcpy() should take into account the size of the pointers, not only
the number of pointers to copy.

Signed-off-by: Eric Dumazet
Acked-by: Pavel Emelyanov
Signed-off-by: David S. Miller
---
 net/core/net_namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b7292a2719d..197283072cc 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -488,7 +488,7 @@ int net_assign_generic(struct net *net, int id, void *data)
 	 */
 
 	ng->len = id;
-	memcpy(&ng->ptr, &old_ng->ptr, old_ng->len);
+	memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
 
 	rcu_assign_pointer(net->gen, ng);
 	call_rcu(&old_ng->rcu, net_generic_release);
--
cgit v1.2.3-18-g5258
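The bug fixed above is the classic element-count-versus-byte-count
confusion with memcpy(). A minimal standalone illustration (hypothetical
helper, not kernel code):

    #include <string.h>

    /* Copy an array of n pointers from src to dst. */
    static void copy_ptr_array(void **dst, void * const *src, size_t n)
    {
            /* Wrong: memcpy(dst, src, n); copies n *bytes*, i.e. only
             * n / sizeof(void *) pointers, silently dropping the rest. */
            memcpy(dst, src, n * sizeof(void *)); /* scale count by element size */
    }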
From a6ac65db2329e7685299666f5f7b6093c7b0f3a0 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Thu, 30 Jul 2009 01:06:12 +0000
Subject: net: restore the original spinlock to protect unicast list

There is a path where the assertion in dev_unicast_sync() fires:

igmp6_group_added -> dev_mc_add -> __dev_set_rx_mode ->
	-> vlan_dev_set_rx_mode -> dev_unicast_sync

Therefore we cannot protect this list with rtnl. This patch restores
the original protection of this list with a spinlock.

Signed-off-by: Jiri Pirko
Tested-by: Meelis Roos
Signed-off-by: David S. Miller
---
 net/core/dev.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 70c27e0c7c3..43e61ba7bd9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3865,10 +3865,12 @@ int dev_unicast_delete(struct net_device *dev, void *addr)
 
 	ASSERT_RTNL();
 
+	netif_addr_lock_bh(dev);
 	err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
 			    NETDEV_HW_ADDR_T_UNICAST);
 	if (!err)
 		__dev_set_rx_mode(dev);
+	netif_addr_unlock_bh(dev);
 	return err;
 }
 EXPORT_SYMBOL(dev_unicast_delete);
@@ -3889,10 +3891,12 @@ int dev_unicast_add(struct net_device *dev, void *addr)
 
 	ASSERT_RTNL();
 
+	netif_addr_lock_bh(dev);
 	err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
 			    NETDEV_HW_ADDR_T_UNICAST);
 	if (!err)
 		__dev_set_rx_mode(dev);
+	netif_addr_unlock_bh(dev);
 	return err;
 }
 EXPORT_SYMBOL(dev_unicast_add);
@@ -3949,7 +3953,8 @@ void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
  *	@from: source device
  *
  *	Add newly added addresses to the destination device and release
- *	addresses that have no users left.
+ *	addresses that have no users left. The source device must be
+ *	locked by netif_tx_lock_bh.
 *
 *	This function is intended to be called from the dev->set_rx_mode
 *	function of layered software devices.
@@ -3958,14 +3963,14 @@ int dev_unicast_sync(struct net_device *to, struct net_device *from)
 {
 	int err = 0;
 
-	ASSERT_RTNL();
-
 	if (to->addr_len != from->addr_len)
 		return -EINVAL;
 
+	netif_addr_lock_bh(to);
 	err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
 	if (!err)
 		__dev_set_rx_mode(to);
+	netif_addr_unlock_bh(to);
 	return err;
 }
 EXPORT_SYMBOL(dev_unicast_sync);
@@ -3981,28 +3986,30 @@ EXPORT_SYMBOL(dev_unicast_sync);
  */
 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
 {
-	ASSERT_RTNL();
-
 	if (to->addr_len != from->addr_len)
 		return;
 
+	netif_addr_lock_bh(from);
+	netif_addr_lock(to);
 	__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
 	__dev_set_rx_mode(to);
+	netif_addr_unlock(to);
+	netif_addr_unlock_bh(from);
 }
 EXPORT_SYMBOL(dev_unicast_unsync);
 
 static void dev_unicast_flush(struct net_device *dev)
 {
-	/* rtnl_mutex must be held here */
-
+	netif_addr_lock_bh(dev);
 	__hw_addr_flush(&dev->uc);
+	netif_addr_unlock_bh(dev);
 }
 
 static void dev_unicast_init(struct net_device *dev)
 {
-	/* rtnl_mutex must be held here */
-
+	netif_addr_lock_bh(dev);
 	__hw_addr_init(&dev->uc);
+	netif_addr_unlock_bh(dev);
 }
--
cgit v1.2.3-18-g5258
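The call chain in the commit message bottoms out in the VLAN driver's
rx-mode handler. A rough sketch of how it reaches dev_unicast_sync()
without holding the RTNL mutex (paraphrased from the net/8021q/vlan_dev.c
of this era, not verbatim):

    /* Sketch: a layered device's rx-mode handler propagating its address
     * lists to the underlying real device. __dev_set_rx_mode() can be
     * reached from dev_mc_add() in contexts that never took rtnl_lock(),
     * which is why an ASSERT_RTNL() inside dev_unicast_sync() fires. */
    static void vlan_dev_set_rx_mode(struct net_device *vlan_dev)
    {
            struct net_device *real_dev = vlan_dev_info(vlan_dev)->real_dev;

            dev_mc_sync(real_dev, vlan_dev);
            dev_unicast_sync(real_dev, vlan_dev);
    }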
From 0bf52b981770cbf006323bab5177f2858a196766 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 4 Aug 2009 21:16:58 +0000
Subject: net: Fix spinlock use in alloc_netdev_mq()

-tip testing found this lockdep warning:

[    2.272010] calling  net_dev_init+0x0/0x164 @ 1
[    2.276033] device class 'net': registering
[    2.280191] INFO: trying to register non-static key.
[    2.284005] the code is fine but needs lockdep annotation.
[    2.284005] turning off the locking correctness validator.
[    2.284005] Pid: 1, comm: swapper Not tainted 2.6.31-rc5-tip #1145
[    2.284005] Call Trace:
[    2.284005]  [<7958eb4e>] ? printk+0xf/0x11
[    2.284005]  [<7904f83c>] __lock_acquire+0x11b/0x622
[    2.284005]  [<7908c9b7>] ? alloc_debug_processing+0xf9/0x144
[    2.284005]  [<7904e2be>] ? mark_held_locks+0x3a/0x52
[    2.284005]  [<7908dbc4>] ? kmem_cache_alloc+0xa8/0x13f
[    2.284005]  [<7904e475>] ? trace_hardirqs_on_caller+0xa2/0xc3
[    2.284005]  [<7904fdf6>] lock_acquire+0xb3/0xd0
[    2.284005]  [<79489678>] ? alloc_netdev_mq+0xf5/0x1ad
[    2.284005]  [<79591514>] _spin_lock_bh+0x2d/0x5d
[    2.284005]  [<79489678>] ? alloc_netdev_mq+0xf5/0x1ad
[    2.284005]  [<79489678>] alloc_netdev_mq+0xf5/0x1ad
[    2.284005]  [<793a38f2>] ? loopback_setup+0x0/0x74
[    2.284005]  [<798eecd0>] loopback_net_init+0x20/0x5d
[    2.284005]  [<79483efb>] register_pernet_device+0x23/0x4b
[    2.284005]  [<798f5c9f>] net_dev_init+0x115/0x164
[    2.284005]  [<7900104f>] do_one_initcall+0x4a/0x11a
[    2.284005]  [<798f5b8a>] ? net_dev_init+0x0/0x164
[    2.284005]  [<79066f6d>] ? register_irq_proc+0x8c/0xa8
[    2.284005]  [<798cc29a>] do_basic_setup+0x42/0x52
[    2.284005]  [<798cc30a>] kernel_init+0x60/0xa1
[    2.284005]  [<798cc2aa>] ? kernel_init+0x0/0xa1
[    2.284005]  [<79003e03>] kernel_thread_helper+0x7/0x10
[    2.284078] device: 'lo': device_add
[    2.288248] initcall net_dev_init+0x0/0x164 returned 0 after 11718 usecs
[    2.292010] calling  neigh_init+0x0/0x66 @ 1
[    2.296010] initcall neigh_init+0x0/0x66 returned 0 after 0 usecs

It's using a zero-initialized spinlock. This is a side effect of:

	dev_unicast_init(dev);

in alloc_netdev_mq() making use of dev->addr_list_lock.

The device has just been freshly allocated and is not accessible
anywhere yet, so no locking is needed at all - in fact it's wrong to
lock it here (the lock isn't initialized yet).

This bug was introduced via:

| commit a6ac65db2329e7685299666f5f7b6093c7b0f3a0
| Date:   Thu Jul 30 01:06:12 2009 +0000
|
|     net: restore the original spinlock to protect unicast list

Signed-off-by: Ingo Molnar
Acked-by: Jiri Pirko
Tested-by: Mark Brown
Signed-off-by: David S. Miller
---
 net/core/dev.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 43e61ba7bd9..6a94475aee8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4007,9 +4007,7 @@ static void dev_unicast_flush(struct net_device *dev)
 
 static void dev_unicast_init(struct net_device *dev)
 {
-	netif_addr_lock_bh(dev);
 	__hw_addr_init(&dev->uc);
-	netif_addr_unlock_bh(dev);
 }
--
cgit v1.2.3-18-g5258
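The "INFO: trying to register non-static key." line in the trace above
is lockdep's way of saying a dynamically allocated lock was taken before
spin_lock_init() gave it a lock class. A minimal illustration of the
rule (illustrative code, not from the kernel tree):

    #include <linux/slab.h>
    #include <linux/spinlock.h>

    static void lock_init_order(void)
    {
            spinlock_t *lock = kzalloc(sizeof(*lock), GFP_KERNEL);

            if (!lock)
                    return;
            /*
             * Calling spin_lock(lock) at this point would reproduce the
             * warning: zeroed memory is not an initialized lock. A
             * freshly allocated object that nothing else can see needs
             * no locking anyway - which is why the patch simply drops
             * the lock/unlock pair around __hw_addr_init().
             */
            spin_lock_init(lock);   /* register the lockdep key first */
            spin_lock(lock);
            spin_unlock(lock);
            kfree(lock);
    }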