From efcdbf24fd5daa88060869e51ed49f68b7ac8708 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Mon, 30 Jan 2012 14:16:06 -0800 Subject: net: Disambiguate kernel message Some of our machines were reporting: TCP: too many of orphaned sockets even when the number of orphaned sockets was well below the limit. We print a different message depending on whether we're out of TCP memory or there are too many orphaned sockets. Also move the check out of line and cleanup the messages that were printed. Signed-off-by: Arun Sharma Suggested-by: Mohan Srinivasan Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: David Miller Cc: Glauber Costa Cc: Ingo Molnar Cc: Joe Perches Signed-off-by: David S. Miller --- include/net/tcp.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index d49db0113a0..42c29bfbcee 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -273,6 +273,14 @@ static inline int between(__u32 seq1, __u32 seq2, __u32 seq3) return seq3 - seq2 >= seq1 - seq2; } +static inline bool tcp_out_of_memory(struct sock *sk) +{ + if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) + return true; + return false; +} + static inline bool tcp_too_many_orphans(struct sock *sk, int shift) { struct percpu_counter *ocp = sk->sk_prot->orphan_count; @@ -283,13 +291,11 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift) if (orphans << shift > sysctl_tcp_max_orphans) return true; } - - if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) - return true; return false; } +extern bool tcp_check_oom(struct sock *sk, int shift); + /* syncookies: remember time of last synqueue overflow */ static inline void tcp_synq_overflow(struct sock *sk) { -- cgit v1.2.3-70-g09d2 From e6b45241c57a83197e5de9166b3b0d32ac562609 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 4 Feb 2012 13:04:46 +0000 Subject: ipv4: reset flowi parameters on route connect Eric Dumazet found that commit 813b3b5db83 (ipv4: Use caller's on-stack flowi as-is in output route lookups.) that comes in 3.0 added a regression. The problem appears to be that resulting flowi4_oif is used incorrectly as input parameter to some routing lookups. The result is that when connecting to local port without listener if the IP address that is used is not on a loopback interface we incorrectly assign RTN_UNICAST to the output route because no route is matched by oif=lo. The RST packet can not be sent immediately by tcp_v4_send_reset because it expects RTN_LOCAL. So, change ip_route_connect and ip_route_newports to update the flowi4 fields that are input parameters because we do not want unnecessary binding to oif. To make it clear what are the input parameters that can be modified during lookup and to show which fields of floiw4 are reused add a new function to update the flowi4 structure: flowi4_update_output. Thanks to Yurij M. Plotnikov for providing a bug report including a program to reproduce the problem. Thanks to Eric Dumazet for tracking the problem down to tcp_v4_send_reset and providing initial fix. Reported-by: Yurij M. Plotnikov Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/flow.h | 10 ++++++++++ include/net/route.h | 4 ++++ 2 files changed, 14 insertions(+) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index 9b582437fbe..6c469dbdb91 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -93,6 +93,16 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->fl4_dport = dport; fl4->fl4_sport = sport; } + +/* Reset some input parameters after previous lookup */ +static inline void flowi4_update_output(struct flowi4 *fl4, int oif, __u8 tos, + __be32 daddr, __be32 saddr) +{ + fl4->flowi4_oif = oif; + fl4->flowi4_tos = tos; + fl4->daddr = daddr; + fl4->saddr = saddr; +} struct flowi6 { diff --git a/include/net/route.h b/include/net/route.h index 91855d185b5..b1c0d5b564c 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -270,6 +270,7 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, if (IS_ERR(rt)) return rt; ip_rt_put(rt); + flowi4_update_output(fl4, oif, tos, fl4->daddr, fl4->saddr); } security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); return ip_route_output_flow(net, fl4, sk); @@ -284,6 +285,9 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable fl4->fl4_dport = dport; fl4->fl4_sport = sport; ip_rt_put(rt); + flowi4_update_output(fl4, sk->sk_bound_dev_if, + RT_CONN_FLAGS(sk), fl4->daddr, + fl4->saddr); security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); return ip_route_output_flow(sock_net(sk), fl4, sk); } -- cgit v1.2.3-70-g09d2 From 16bda13d90c8d5da243e2cfa1677e62ecce26860 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 6 Feb 2012 15:14:37 -0500 Subject: net: Make qdisc_skb_cb upper size bound explicit. Just like skb->cb[], so that qdisc_skb_cb can be encapsulated inside of other data structures. This is intended to be used by IPoIB so that it can remember addressing information stored at hard_header_ops->create() time that it can fetch when the packet gets to the transmit routine. Signed-off-by: David S. Miller --- include/net/sch_generic.h | 9 ++++++++- net/sched/sch_choke.c | 3 +-- net/sched/sch_netem.c | 3 +-- net/sched/sch_sfb.c | 3 +-- net/sched/sch_sfq.c | 5 ++--- 5 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f6bb08b73ca..55ce96b53b0 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -220,9 +220,16 @@ struct tcf_proto { struct qdisc_skb_cb { unsigned int pkt_len; - long data[]; + unsigned char data[24]; }; +static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) +{ + struct qdisc_skb_cb *qcb; + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(unsigned int) + sz); + BUILD_BUG_ON(sizeof(qcb->data) < sz); +} + static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index e465064d39a..7e267d7b9c7 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -148,8 +148,7 @@ struct choke_skb_cb { static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct choke_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb)); return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 2776012132e..e83d61ca78c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -130,8 +130,7 @@ struct netem_skb_cb { static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 96e42cae4c7..d7eea99333e 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -94,8 +94,7 @@ struct sfb_skb_cb { static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb)); return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 67494aef9ac..60d47180f04 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -166,9 +166,8 @@ struct sfq_skb_cb { static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct sfq_skb_cb)); - return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; + qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb)); + return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; } static unsigned int sfq_hash(const struct sfq_sched_data *q, -- cgit v1.2.3-70-g09d2 From 2b73bc65e2771372c818db7955709c8caedbf8b9 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 10 Feb 2012 05:43:38 +0000 Subject: netprio_cgroup: fix wrong memory access when NETPRIO_CGROUP=m When the netprio_cgroup module is not loaded, net_prio_subsys_id is -1, and so sock_update_prioidx() accesses cgroup_subsys array with negative index subsys[-1]. Make the code resembles cls_cgroup code, which is bug free. Origionally-authored-by: Li Zefan Signed-off-by: Li Zefan Signed-off-by: Neil Horman CC: "David S. Miller" Signed-off-by: David S. Miller --- include/net/netprio_cgroup.h | 48 ++++++++++++++++++++++++++++++++++++-------- net/core/sock.c | 7 ++----- 2 files changed, 42 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index 7b2d43139c8..d58fdec4759 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -37,19 +37,51 @@ extern int net_prio_subsys_id; extern void sock_update_netprioidx(struct sock *sk); -static inline struct cgroup_netprio_state - *task_netprio_state(struct task_struct *p) +#if IS_BUILTIN(CONFIG_NETPRIO_CGROUP) + +static inline u32 task_netprioidx(struct task_struct *p) { -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) - return container_of(task_subsys_state(p, net_prio_subsys_id), - struct cgroup_netprio_state, css); -#else - return NULL; -#endif + struct cgroup_netprio_state *state; + u32 idx; + + rcu_read_lock(); + state = container_of(task_subsys_state(p, net_prio_subsys_id), + struct cgroup_netprio_state, css); + idx = state->prioidx; + rcu_read_unlock(); + return idx; +} + +#elif IS_MODULE(CONFIG_NETPRIO_CGROUP) + +static inline u32 task_netprioidx(struct task_struct *p) +{ + struct cgroup_netprio_state *state; + int subsys_id; + u32 idx = 0; + + rcu_read_lock(); + subsys_id = rcu_dereference_index_check(net_prio_subsys_id, + rcu_read_lock_held()); + if (subsys_id >= 0) { + state = container_of(task_subsys_state(p, subsys_id), + struct cgroup_netprio_state, css); + idx = state->prioidx; + } + rcu_read_unlock(); + return idx; } #else +static inline u32 task_netprioidx(struct task_struct *p) +{ + return 0; +} + +#endif /* CONFIG_NETPRIO_CGROUP */ + +#else #define sock_update_netprioidx(sk) #endif diff --git a/net/core/sock.c b/net/core/sock.c index 3e81fd2e3c7..02f8dfe320b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1171,13 +1171,10 @@ EXPORT_SYMBOL(sock_update_classid); void sock_update_netprioidx(struct sock *sk) { - struct cgroup_netprio_state *state; if (in_interrupt()) return; - rcu_read_lock(); - state = task_netprio_state(current); - sk->sk_cgrp_prioidx = state ? state->prioidx : 0; - rcu_read_unlock(); + + sk->sk_cgrp_prioidx = task_netprioidx(current); } EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif -- cgit v1.2.3-70-g09d2 From 4aa832c27edf902130f8bace1d42cf22468823fa Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Sun, 8 Jan 2012 22:51:16 +0200 Subject: Bluetooth: Remove bogus inline declaration from l2cap_chan_connect As reported by Dan Carpenter this function causes a Sparse warning and shouldn't be declared inline: include/net/bluetooth/l2cap.h:837:30 error: marked inline, but without a definition" Reported-by: Dan Carpenter Signed-off-by: Johan Hedberg Acked-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 2 +- net/bluetooth/l2cap_core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 68f58915069..124f7cf45bd 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -834,7 +834,7 @@ int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid); struct l2cap_chan *l2cap_chan_create(struct sock *sk); void l2cap_chan_close(struct l2cap_chan *chan, int reason); void l2cap_chan_destroy(struct l2cap_chan *chan); -inline int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, +int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst); int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, u32 priority); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index faf0b11ac1d..980abdb3067 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1120,7 +1120,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, bdaddr return c1; } -inline int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst) +int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst) { struct sock *sk = chan->sk; bdaddr_t *src = &bt_sk(sk)->src; -- cgit v1.2.3-70-g09d2 From 6e1da683f79a22fafaada62d547138daaa9e3456 Mon Sep 17 00:00:00 2001 From: Andrzej Kaczmarek Date: Wed, 4 Jan 2012 12:10:42 +0100 Subject: Bluetooth: l2cap_set_timer needs jiffies as timeout value After moving L2CAP timers to workqueues l2cap_set_timer expects timeout value to be specified in jiffies but constants defined in miliseconds are used. This makes timeouts unreliable when CONFIG_HZ is not set to 1000. __set_chan_timer macro still uses jiffies as input to avoid multiple conversions from/to jiffies for sk_sndtimeo value which is already specified in jiffies. Signed-off-by: Andrzej Kaczmarek Ackec-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/l2cap.h | 6 +++--- net/bluetooth/l2cap_core.c | 12 ++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 124f7cf45bd..26486e182f5 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -626,13 +626,13 @@ static inline void l2cap_clear_timer(struct l2cap_chan *chan, #define __set_chan_timer(c, t) l2cap_set_timer(c, &c->chan_timer, (t)) #define __clear_chan_timer(c) l2cap_clear_timer(c, &c->chan_timer) #define __set_retrans_timer(c) l2cap_set_timer(c, &c->retrans_timer, \ - L2CAP_DEFAULT_RETRANS_TO); + msecs_to_jiffies(L2CAP_DEFAULT_RETRANS_TO)); #define __clear_retrans_timer(c) l2cap_clear_timer(c, &c->retrans_timer) #define __set_monitor_timer(c) l2cap_set_timer(c, &c->monitor_timer, \ - L2CAP_DEFAULT_MONITOR_TO); + msecs_to_jiffies(L2CAP_DEFAULT_MONITOR_TO)); #define __clear_monitor_timer(c) l2cap_clear_timer(c, &c->monitor_timer) #define __set_ack_timer(c) l2cap_set_timer(c, &chan->ack_timer, \ - L2CAP_DEFAULT_ACK_TO); + msecs_to_jiffies(L2CAP_DEFAULT_ACK_TO)); #define __clear_ack_timer(c) l2cap_clear_timer(c, &c->ack_timer) static inline int __seq_offset(struct l2cap_chan *chan, __u16 seq1, __u16 seq2) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 980abdb3067..dcccae04ae0 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -2970,7 +2970,8 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr default: sk->sk_err = ECONNRESET; - __set_chan_timer(chan, L2CAP_DISC_REJ_TIMEOUT); + __set_chan_timer(chan, + msecs_to_jiffies(L2CAP_DISC_REJ_TIMEOUT)); l2cap_send_disconn_req(conn, chan, ECONNRESET); goto done; } @@ -4478,7 +4479,8 @@ static inline void l2cap_check_encryption(struct l2cap_chan *chan, u8 encrypt) if (encrypt == 0x00) { if (chan->sec_level == BT_SECURITY_MEDIUM) { __clear_chan_timer(chan); - __set_chan_timer(chan, L2CAP_ENC_TIMEOUT); + __set_chan_timer(chan, + msecs_to_jiffies(L2CAP_ENC_TIMEOUT)); } else if (chan->sec_level == BT_SECURITY_HIGH) l2cap_chan_close(chan, ECONNREFUSED); } else { @@ -4546,7 +4548,8 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) L2CAP_CONN_REQ, sizeof(req), &req); } else { __clear_chan_timer(chan); - __set_chan_timer(chan, L2CAP_DISC_TIMEOUT); + __set_chan_timer(chan, + msecs_to_jiffies(L2CAP_DISC_TIMEOUT)); } } else if (chan->state == BT_CONNECT2) { struct l2cap_conn_rsp rsp; @@ -4566,7 +4569,8 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) } } else { l2cap_state_change(chan, BT_DISCONN); - __set_chan_timer(chan, L2CAP_DISC_TIMEOUT); + __set_chan_timer(chan, + msecs_to_jiffies(L2CAP_DISC_TIMEOUT)); res = L2CAP_CR_SEC_BLOCK; stat = L2CAP_CS_NO_INFO; } -- cgit v1.2.3-70-g09d2 From 331660637b4e5136602a98200a84f6b65ed8d5be Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Wed, 4 Jan 2012 11:57:17 -0300 Subject: Bluetooth: Fix using an absolute timeout on hci_conn_put() queue_delayed_work() expects a relative time for when that work should be scheduled. Signed-off-by: Vinicius Costa Gomes Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ea9231f4935..92b8fea553d 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -561,7 +561,7 @@ static inline void hci_conn_put(struct hci_conn *conn) } cancel_delayed_work_sync(&conn->disc_work); queue_delayed_work(conn->hdev->workqueue, - &conn->disc_work, jiffies + timeo); + &conn->disc_work, timeo); } } -- cgit v1.2.3-70-g09d2 From b5a30dda6598af216c070165ece6068f9f00f33a Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Sun, 22 Jan 2012 00:28:34 +0200 Subject: Bluetooth: silence lockdep warning Since bluetooth uses multiple protocols types, to avoid lockdep warnings, we need to use different lockdep classes (one for each protocol type). This is already done in bt_sock_create but it misses a couple of cases when new connections are created. This patch corrects that to fix the following warning: <4>[ 1864.732366] ======================================================= <4>[ 1864.733030] [ INFO: possible circular locking dependency detected ] <4>[ 1864.733544] 3.0.16-mid3-00007-gc9a0f62 #3 <4>[ 1864.733883] ------------------------------------------------------- <4>[ 1864.734408] t.android.btclc/4204 is trying to acquire lock: <4>[ 1864.734869] (rfcomm_mutex){+.+.+.}, at: [] rfcomm_dlc_close+0x15/0x30 <4>[ 1864.735541] <4>[ 1864.735549] but task is already holding lock: <4>[ 1864.736045] (sk_lock-AF_BLUETOOTH){+.+.+.}, at: [] lock_sock+0xa/0xc <4>[ 1864.736732] <4>[ 1864.736740] which lock already depends on the new lock. <4>[ 1864.736750] <4>[ 1864.737428] <4>[ 1864.737437] the existing dependency chain (in reverse order) is: <4>[ 1864.738016] <4>[ 1864.738023] -> #1 (sk_lock-AF_BLUETOOTH){+.+.+.}: <4>[ 1864.738549] [] lock_acquire+0x104/0x140 <4>[ 1864.738977] [] lock_sock_nested+0x58/0x68 <4>[ 1864.739411] [] l2cap_sock_sendmsg+0x3e/0x76 <4>[ 1864.739858] [] __sock_sendmsg+0x50/0x59 <4>[ 1864.740279] [] sock_sendmsg+0x94/0xa8 <4>[ 1864.740687] [] kernel_sendmsg+0x28/0x37 <4>[ 1864.741106] [] rfcomm_send_frame+0x30/0x38 <4>[ 1864.741542] [] rfcomm_send_ua+0x58/0x5a <4>[ 1864.741959] [] rfcomm_run+0x441/0xb52 <4>[ 1864.742365] [] kthread+0x63/0x68 <4>[ 1864.742742] [] kernel_thread_helper+0x6/0xd <4>[ 1864.743187] <4>[ 1864.743193] -> #0 (rfcomm_mutex){+.+.+.}: <4>[ 1864.743667] [] __lock_acquire+0x988/0xc00 <4>[ 1864.744100] [] lock_acquire+0x104/0x140 <4>[ 1864.744519] [] __mutex_lock_common+0x3b/0x33f <4>[ 1864.744975] [] mutex_lock_nested+0x2d/0x36 <4>[ 1864.745412] [] rfcomm_dlc_close+0x15/0x30 <4>[ 1864.745842] [] __rfcomm_sock_close+0x5f/0x6b <4>[ 1864.746288] [] rfcomm_sock_shutdown+0x2f/0x62 <4>[ 1864.746737] [] sys_socketcall+0x1db/0x422 <4>[ 1864.747165] [] syscall_call+0x7/0xb Signed-off-by: Octavian Purdila Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 2 ++ net/bluetooth/af_bluetooth.c | 12 +++++------- net/bluetooth/l2cap_sock.c | 2 ++ net/bluetooth/rfcomm/sock.c | 2 ++ 4 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index abaad6ed9b8..4a82ca0bb0b 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -256,4 +256,6 @@ void l2cap_exit(void); int sco_init(void); void sco_exit(void); +void bt_sock_reclassify_lock(struct sock *sk, int proto); + #endif /* __BLUETOOTH_H */ diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index ef92864ac62..72eb187a5f6 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -71,19 +71,16 @@ static const char *const bt_slock_key_strings[BT_MAX_PROTO] = { "slock-AF_BLUETOOTH-BTPROTO_AVDTP", }; -static inline void bt_sock_reclassify_lock(struct socket *sock, int proto) +void bt_sock_reclassify_lock(struct sock *sk, int proto) { - struct sock *sk = sock->sk; - - if (!sk) - return; - + BUG_ON(!sk); BUG_ON(sock_owned_by_user(sk)); sock_lock_init_class_and_name(sk, bt_slock_key_strings[proto], &bt_slock_key[proto], bt_key_strings[proto], &bt_lock_key[proto]); } +EXPORT_SYMBOL(bt_sock_reclassify_lock); int bt_sock_register(int proto, const struct net_proto_family *ops) { @@ -145,7 +142,8 @@ static int bt_sock_create(struct net *net, struct socket *sock, int proto, if (bt_proto[proto] && try_module_get(bt_proto[proto]->owner)) { err = bt_proto[proto]->create(net, sock, proto, kern); - bt_sock_reclassify_lock(sock, proto); + if (!err) + bt_sock_reclassify_lock(sock->sk, proto); module_put(bt_proto[proto]->owner); } diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index c57027f7606..401d9428ae4 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -849,6 +849,8 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(void *data) if (!sk) return NULL; + bt_sock_reclassify_lock(sk, BTPROTO_L2CAP); + l2cap_sock_init(sk, parent); return l2cap_pi(sk)->chan; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index f066678faee..22169c3f148 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -956,6 +956,8 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * if (!sk) goto done; + bt_sock_reclassify_lock(sk, BTPROTO_RFCOMM); + rfcomm_sock_init(sk, parent); bacpy(&bt_sk(sk)->src, &src); bacpy(&bt_sk(sk)->dst, &dst); -- cgit v1.2.3-70-g09d2 From a51cd2be864a3cc0272359b1995e213341dfc7e7 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Fri, 27 Jan 2012 19:42:02 -0300 Subject: Bluetooth: Fix potential deadlock We don't need to use the _sync variant in hci_conn_hold and hci_conn_put to cancel conn->disc_work delayed work. This way we avoid potential deadlocks like this one reported by lockdep. ====================================================== [ INFO: possible circular locking dependency detected ] 3.2.0+ #1 Not tainted ------------------------------------------------------- kworker/u:1/17 is trying to acquire lock: (&hdev->lock){+.+.+.}, at: [] hci_conn_timeout+0x62/0x158 [bluetooth] but task is already holding lock: ((&(&conn->disc_work)->work)){+.+...}, at: [] process_one_work+0x11a/0x2bf which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 ((&(&conn->disc_work)->work)){+.+...}: [] lock_acquire+0x8a/0xa7 [] wait_on_work+0x3d/0xaa [] __cancel_work_timer+0xac/0xef [] cancel_delayed_work_sync+0xd/0xf [] smp_chan_create+0xde/0xe6 [bluetooth] [] smp_conn_security+0xa3/0x12d [bluetooth] [] l2cap_connect_cfm+0x237/0x2e8 [bluetooth] [] hci_proto_connect_cfm+0x2d/0x6f [bluetooth] [] hci_event_packet+0x29d1/0x2d60 [bluetooth] [] hci_rx_work+0xd0/0x2e1 [bluetooth] [] process_one_work+0x178/0x2bf [] worker_thread+0xce/0x152 [] kthread+0x95/0x9d [] kernel_thread_helper+0x4/0x10 -> #1 (slock-AF_BLUETOOTH-BTPROTO_L2CAP){+.+...}: [] lock_acquire+0x8a/0xa7 [] _raw_spin_lock_bh+0x36/0x6a [] lock_sock_nested+0x24/0x7f [] lock_sock+0xb/0xd [bluetooth] [] l2cap_chan_connect+0xa9/0x26f [bluetooth] [] l2cap_sock_connect+0xb3/0xff [bluetooth] [] sys_connect+0x69/0x8a [] system_call_fastpath+0x16/0x1b -> #0 (&hdev->lock){+.+.+.}: [] __lock_acquire+0xa80/0xd74 [] lock_acquire+0x8a/0xa7 [] __mutex_lock_common+0x48/0x38e [] mutex_lock_nested+0x2a/0x31 [] hci_conn_timeout+0x62/0x158 [bluetooth] [] process_one_work+0x178/0x2bf [] worker_thread+0xce/0x152 [] kthread+0x95/0x9d [] kernel_thread_helper+0x4/0x10 other info that might help us debug this: Chain exists of: &hdev->lock --> slock-AF_BLUETOOTH-BTPROTO_L2CAP --> (&(&conn->disc_work)->work) Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock((&(&conn->disc_work)->work)); lock(slock-AF_BLUETOOTH-BTPROTO_L2CAP); lock((&(&conn->disc_work)->work)); lock(&hdev->lock); *** DEADLOCK *** 2 locks held by kworker/u:1/17: #0: (hdev->name){.+.+.+}, at: [] process_one_work+0x11a/0x2bf #1: ((&(&conn->disc_work)->work)){+.+...}, at: [] process_one_work+0x11a/0x2bf stack backtrace: Pid: 17, comm: kworker/u:1 Not tainted 3.2.0+ #1 Call Trace: [] print_circular_bug+0x1f8/0x209 [] __lock_acquire+0xa80/0xd74 [] ? arch_local_irq_restore+0x6/0xd [] ? vprintk+0x3f9/0x41e [] lock_acquire+0x8a/0xa7 [] ? hci_conn_timeout+0x62/0x158 [bluetooth] [] __mutex_lock_common+0x48/0x38e [] ? hci_conn_timeout+0x62/0x158 [bluetooth] [] ? __dynamic_pr_debug+0x6d/0x6f [] ? hci_conn_timeout+0x62/0x158 [bluetooth] [] ? trace_hardirqs_off+0xd/0xf [] mutex_lock_nested+0x2a/0x31 [] hci_conn_timeout+0x62/0x158 [bluetooth] [] process_one_work+0x178/0x2bf [] ? process_one_work+0x11a/0x2bf [] ? lock_acquired+0x1d0/0x1df [] ? hci_acl_disconn+0x65/0x65 [bluetooth] [] worker_thread+0xce/0x152 [] ? finish_task_switch+0x45/0xc5 [] ? manage_workers.isra.25+0x16a/0x16a [] kthread+0x95/0x9d [] kernel_thread_helper+0x4/0x10 [] ? retint_restore_args+0x13/0x13 [] ? __init_kthread_worker+0x55/0x55 [] ? gs_change+0x13/0x13 Signed-off-by: Andre Guedes Signed-off-by: Vinicius Costa Gomes Reviewed-by: Ulisses Furquim Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 92b8fea553d..453893b3120 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -540,7 +540,7 @@ void hci_conn_put_device(struct hci_conn *conn); static inline void hci_conn_hold(struct hci_conn *conn) { atomic_inc(&conn->refcnt); - cancel_delayed_work_sync(&conn->disc_work); + cancel_delayed_work(&conn->disc_work); } static inline void hci_conn_put(struct hci_conn *conn) @@ -559,7 +559,7 @@ static inline void hci_conn_put(struct hci_conn *conn) } else { timeo = msecs_to_jiffies(10); } - cancel_delayed_work_sync(&conn->disc_work); + cancel_delayed_work(&conn->disc_work); queue_delayed_work(conn->hdev->workqueue, &conn->disc_work, timeo); } -- cgit v1.2.3-70-g09d2 From 6de32750822d00bfa92c341166132b0714c5b559 Mon Sep 17 00:00:00 2001 From: Ulisses Furquim Date: Mon, 30 Jan 2012 18:26:28 -0200 Subject: Bluetooth: Remove usage of __cancel_delayed_work() __cancel_delayed_work() is being used in some paths where we cannot sleep waiting for the delayed work to finish. However, that function might return while the timer is running and the work will be queued again. Replace the calls with safer cancel_delayed_work() version which spins until the timer handler finishes on other CPUs and cancels the delayed work. Signed-off-by: Ulisses Furquim Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/l2cap.h | 4 ++-- net/bluetooth/l2cap_core.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 26486e182f5..b1664ed884e 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -611,7 +611,7 @@ static inline void l2cap_set_timer(struct l2cap_chan *chan, { BT_DBG("chan %p state %d timeout %ld", chan, chan->state, timeout); - if (!__cancel_delayed_work(work)) + if (!cancel_delayed_work(work)) l2cap_chan_hold(chan); schedule_delayed_work(work, timeout); } @@ -619,7 +619,7 @@ static inline void l2cap_set_timer(struct l2cap_chan *chan, static inline void l2cap_clear_timer(struct l2cap_chan *chan, struct delayed_work *work) { - if (__cancel_delayed_work(work)) + if (cancel_delayed_work(work)) l2cap_chan_put(chan); } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index dcccae04ae0..ec10c698b89 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -2574,7 +2574,7 @@ static inline int l2cap_command_rej(struct l2cap_conn *conn, struct l2cap_cmd_hd if ((conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) && cmd->ident == conn->info_ident) { - __cancel_delayed_work(&conn->info_timer); + cancel_delayed_work(&conn->info_timer); conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; conn->info_ident = 0; @@ -3121,7 +3121,7 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn, struct l2cap_cm conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) return 0; - __cancel_delayed_work(&conn->info_timer); + cancel_delayed_work(&conn->info_timer); if (result != L2CAP_IR_SUCCESS) { conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; @@ -4501,7 +4501,7 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) if (hcon->type == LE_LINK) { smp_distribute_keys(conn, 0); - __cancel_delayed_work(&conn->security_timer); + cancel_delayed_work(&conn->security_timer); } rcu_read_lock(); -- cgit v1.2.3-70-g09d2 From 115c9b81928360d769a76c632bae62d15206a94a Mon Sep 17 00:00:00 2001 From: Greg Rose Date: Tue, 21 Feb 2012 16:54:48 -0500 Subject: rtnetlink: Fix problem with buffer allocation Implement a new netlink attribute type IFLA_EXT_MASK. The mask is a 32 bit value that can be used to indicate to the kernel that certain extended ifinfo values are requested by the user application. At this time the only mask value defined is RTEXT_FILTER_VF to indicate that the user wants the ifinfo dump to send information about the VFs belonging to the interface. This patch fixes a bug in which certain applications do not have large enough buffers to accommodate the extra information returned by the kernel with large numbers of SR-IOV virtual functions. Those applications will not send the new netlink attribute with the interface info dump request netlink messages so they will not get unexpectedly large request buffers returned by the kernel. Modifies the rtnl_calcit function to traverse the list of net devices and compute the minimum buffer size that can hold the info dumps of all matching devices based upon the filter passed in via the new netlink attribute filter mask. If no filter mask is sent then the buffer allocation defaults to NLMSG_GOODSIZE. With this change it is possible to add yet to be defined netlink attributes to the dump request which should make it fairly extensible in the future. Signed-off-by: Greg Rose Signed-off-by: David S. Miller --- include/linux/if_link.h | 1 + include/linux/rtnetlink.h | 3 ++ include/net/rtnetlink.h | 2 +- net/core/rtnetlink.c | 78 ++++++++++++++++++++++++++++++++++------------- 4 files changed, 62 insertions(+), 22 deletions(-) (limited to 'include/net') diff --git a/include/linux/if_link.h b/include/linux/if_link.h index c52d4b5f872..4b24ff453ae 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -137,6 +137,7 @@ enum { IFLA_AF_SPEC, IFLA_GROUP, /* Group the device belongs to */ IFLA_NET_NS_FD, + IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ __IFLA_MAX }; diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 8e872ead88b..577592ea0ea 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -602,6 +602,9 @@ struct tcamsg { #define TCA_ACT_TAB 1 /* attr type must be >=1 */ #define TCAA_MAX 1 +/* New extended info filters for IFLA_EXT_MASK */ +#define RTEXT_FILTER_VF (1 << 0) + /* End of information exported to user level */ #ifdef __KERNEL__ diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 678f1ffaf84..37029390197 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -6,7 +6,7 @@ typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, void *); typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); -typedef u16 (*rtnl_calcit_func)(struct sk_buff *); +typedef u16 (*rtnl_calcit_func)(struct sk_buff *, struct nlmsghdr *); extern int __rtnl_register(int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 65aebd45002..606a6e8f367 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -60,7 +60,6 @@ struct rtnl_link { }; static DEFINE_MUTEX(rtnl_mutex); -static u16 min_ifinfo_dump_size; void rtnl_lock(void) { @@ -724,10 +723,11 @@ static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) } /* All VF info */ -static inline int rtnl_vfinfo_size(const struct net_device *dev) +static inline int rtnl_vfinfo_size(const struct net_device *dev, + u32 ext_filter_mask) { - if (dev->dev.parent && dev_is_pci(dev->dev.parent)) { - + if (dev->dev.parent && dev_is_pci(dev->dev.parent) && + (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); size_t size = nla_total_size(sizeof(struct nlattr)); size += nla_total_size(num_vfs * sizeof(struct nlattr)); @@ -766,7 +766,8 @@ static size_t rtnl_port_size(const struct net_device *dev) return port_self_size; } -static noinline size_t if_nlmsg_size(const struct net_device *dev) +static noinline size_t if_nlmsg_size(const struct net_device *dev, + u32 ext_filter_mask) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ @@ -784,8 +785,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev) + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ - + nla_total_size(4) /* IFLA_NUM_VF */ - + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ + + nla_total_size(ext_filter_mask + & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ @@ -868,7 +870,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags) + unsigned int flags, u32 ext_filter_mask) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -941,10 +943,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; copy_rtnl_link_stats64(nla_data(attr), stats); - if (dev->dev.parent) + if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); - if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent + && (ext_filter_mask & RTEXT_FILTER_VF)) { int i; struct nlattr *vfinfo, *vf; @@ -1048,6 +1051,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *dev; struct hlist_head *head; struct hlist_node *node; + struct nlattr *tb[IFLA_MAX+1]; + u32 ext_filter_mask = 0; s_h = cb->args[0]; s_idx = cb->args[1]; @@ -1055,6 +1060,12 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); cb->seq = net->dev_base_seq; + nlmsg_parse(cb->nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, + ifla_policy); + + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; @@ -1064,7 +1075,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0, - NLM_F_MULTI) <= 0) + NLM_F_MULTI, + ext_filter_mask) <= 0) goto out; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -1100,6 +1112,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_VF_PORTS] = { .type = NLA_NESTED }, [IFLA_PORT_SELF] = { .type = NLA_NESTED }, [IFLA_AF_SPEC] = { .type = NLA_NESTED }, + [IFLA_EXT_MASK] = { .type = NLA_U32 }, }; EXPORT_SYMBOL(ifla_policy); @@ -1509,8 +1522,6 @@ errout: if (send_addr_notify) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - min_ifinfo_dump_size = max_t(u16, if_nlmsg_size(dev), - min_ifinfo_dump_size); return err; } @@ -1842,6 +1853,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) struct net_device *dev = NULL; struct sk_buff *nskb; int err; + u32 ext_filter_mask = 0; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); if (err < 0) @@ -1850,6 +1862,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); @@ -1861,12 +1876,12 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (dev == NULL) return -ENODEV; - nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); + nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); if (nskb == NULL) return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, - nlh->nlmsg_seq, 0, 0); + nlh->nlmsg_seq, 0, 0, ext_filter_mask); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -1877,8 +1892,31 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) return err; } -static u16 rtnl_calcit(struct sk_buff *skb) +static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { + struct net *net = sock_net(skb->sk); + struct net_device *dev; + struct nlattr *tb[IFLA_MAX+1]; + u32 ext_filter_mask = 0; + u16 min_ifinfo_dump_size = 0; + + nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, ifla_policy); + + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + + if (!ext_filter_mask) + return NLMSG_GOODSIZE; + /* + * traverse the list of net devices and compute the minimum + * buffer size based upon the filter mask. + */ + list_for_each_entry(dev, &net->dev_base_head, dev_list) { + min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size, + if_nlmsg_size(dev, + ext_filter_mask)); + } + return min_ifinfo_dump_size; } @@ -1913,13 +1951,11 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) int err = -ENOBUFS; size_t if_info_size; - skb = nlmsg_new((if_info_size = if_nlmsg_size(dev)), GFP_KERNEL); + skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL); if (skb == NULL) goto errout; - min_ifinfo_dump_size = max_t(u16, if_info_size, min_ifinfo_dump_size); - - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -1977,7 +2013,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EOPNOTSUPP; calcit = rtnl_get_calcit(family, type); if (calcit) - min_dump_alloc = calcit(skb); + min_dump_alloc = calcit(skb, nlh); __rtnl_unlock(); rtnl = net->rtnl; -- cgit v1.2.3-70-g09d2 From 7d367e06688dc7a2cc98c2ace04e1296e1d987e2 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 24 Feb 2012 11:45:49 +0100 Subject: netfilter: ctnetlink: fix soft lockup when netlink adds new entries (v2) Marcell Zambo and Janos Farago noticed and reported that when new conntrack entries are added via netlink and the conntrack table gets full, soft lockup happens. This is because the nf_conntrack_lock is held while nf_conntrack_alloc is called, which is in turn wants to lock nf_conntrack_lock while evicting entries from the full table. The patch fixes the soft lockup with limiting the holding of the nf_conntrack_lock to the minimum, where it's absolutely required. It required to extend (and thus change) nf_conntrack_hash_insert so that it makes sure conntrack and ctnetlink do not add the same entry twice to the conntrack table. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 +- net/netfilter/nf_conntrack_core.c | 38 +++++++++++++++++++++++++---- net/netfilter/nf_conntrack_netlink.c | 46 +++++++++++++----------------------- 3 files changed, 51 insertions(+), 35 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 8a2b0ae7dbd..ab86036bbf0 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -209,7 +209,7 @@ extern struct nf_conntrack_tuple_hash * __nf_conntrack_find(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple); -extern void nf_conntrack_hash_insert(struct nf_conn *ct); +extern int nf_conntrack_hash_check_insert(struct nf_conn *ct); extern void nf_ct_delete_from_lists(struct nf_conn *ct); extern void nf_ct_insert_dying_list(struct nf_conn *ct); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 76613f5a55c..ed86a3be678 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -404,19 +404,49 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, &net->ct.hash[repl_hash]); } -void nf_conntrack_hash_insert(struct nf_conn *ct) +int +nf_conntrack_hash_check_insert(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); unsigned int hash, repl_hash; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; u16 zone; zone = nf_ct_zone(ct); - hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + spin_lock_bh(&nf_conntrack_lock); + /* See if there's one in the list already, including reverse */ + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + + add_timer(&ct->timeout); + nf_conntrack_get(&ct->ct_general); __nf_conntrack_hash_insert(ct, hash, repl_hash); + NF_CT_STAT_INC(net, insert); + spin_unlock_bh(&nf_conntrack_lock); + + return 0; + +out: + NF_CT_STAT_INC(net, insert_failed); + spin_unlock_bh(&nf_conntrack_lock); + return -EEXIST; } -EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); +EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); /* Confirm a connection given skb; places it in hash table */ int diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9307b033c0c..30c9d4ca021 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1367,15 +1367,12 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); - spin_unlock_bh(&nf_conntrack_lock); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_lock); err = -EOPNOTSUPP; goto err1; } - spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), @@ -1468,8 +1465,10 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, if (tstamp) tstamp->start = ktime_to_ns(ktime_get_real()); - add_timer(&ct->timeout); - nf_conntrack_hash_insert(ct); + err = nf_conntrack_hash_check_insert(ct); + if (err < 0) + goto err2; + rcu_read_unlock(); return ct; @@ -1490,6 +1489,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; u16 zone; int err; @@ -1510,27 +1510,22 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; } - spin_lock_bh(&nf_conntrack_lock); if (cda[CTA_TUPLE_ORIG]) - h = __nf_conntrack_find(net, zone, &otuple); + h = nf_conntrack_find_get(net, zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = __nf_conntrack_find(net, zone, &rtuple); + h = nf_conntrack_find_get(net, zone, &rtuple); if (h == NULL) { err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { - struct nf_conn *ct; enum ip_conntrack_events events; ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, &rtuple, u3); - if (IS_ERR(ct)) { - err = PTR_ERR(ct); - goto out_unlock; - } + if (IS_ERR(ct)) + return PTR_ERR(ct); + err = 0; - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); if (test_bit(IPS_EXPECTED_BIT, &ct->status)) events = IPCT_RELATED; else @@ -1545,23 +1540,19 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); + } return err; } /* implicit 'else' */ - /* We manipulate the conntrack inside the global conntrack table lock, - * so there's no need to increase the refcount */ err = -EEXIST; + ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - + spin_lock_bh(&nf_conntrack_lock); err = ctnetlink_change_conntrack(ct, cda); + spin_unlock_bh(&nf_conntrack_lock); if (err == 0) { - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | @@ -1570,15 +1561,10 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_MARK), ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); - nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); - - return err; + } } -out_unlock: - spin_unlock_bh(&nf_conntrack_lock); + nf_ct_put(ct); return err; } -- cgit v1.2.3-70-g09d2