diff options
Diffstat (limited to 'net/ipv4/inet_connection_sock.c')
| -rw-r--r-- | net/ipv4/inet_connection_sock.c | 522 |
1 files changed, 377 insertions, 145 deletions
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b189278c7bc..14d02ea905b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -29,50 +29,61 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif -/* - * This array holds the first and last local port number. - */ -int sysctl_local_port_range[2] = { 32768, 61000 }; -DEFINE_SEQLOCK(sysctl_port_range_lock); - -void inet_get_local_port_range(int *low, int *high) +void inet_get_local_port_range(struct net *net, int *low, int *high) { - unsigned seq; + unsigned int seq; + do { - seq = read_seqbegin(&sysctl_port_range_lock); + seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); - *low = sysctl_local_port_range[0]; - *high = sysctl_local_port_range[1]; - } while (read_seqretry(&sysctl_port_range_lock, seq)); + *low = net->ipv4.ip_local_ports.range[0]; + *high = net->ipv4.ip_local_ports.range[1]; + } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); } EXPORT_SYMBOL(inet_get_local_port_range); int inet_csk_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb) + const struct inet_bind_bucket *tb, bool relax) { - const __be32 sk_rcv_saddr = inet_rcv_saddr(sk); struct sock *sk2; - struct hlist_node *node; int reuse = sk->sk_reuse; + int reuseport = sk->sk_reuseport; + kuid_t uid = sock_i_uid((struct sock *)sk); + + /* + * Unlike other sk lookup places we do not check + * for sk_net here, since _all_ the socks listed + * in tb->owners list belong to the same net - the + * one this bucket belongs to. + */ - sk_for_each_bound(sk2, node, &tb->owners) { + sk_for_each_bound(sk2, &tb->owners) { if (sk != sk2 && !inet_v6_ipv6only(sk2) && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { - if (!reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) { - const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); - if (!sk2_rcv_saddr || !sk_rcv_saddr || - sk2_rcv_saddr == sk_rcv_saddr) + if ((!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + (!reuseport || !sk2->sk_reuseport || + (sk2->sk_state != TCP_TIME_WAIT && + !uid_eq(uid, sock_i_uid(sk2))))) { + + if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || + sk2->sk_rcv_saddr == sk->sk_rcv_saddr) + break; + } + if (!relax && reuse && sk2->sk_reuse && + sk2->sk_state != TCP_LISTEN) { + + if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || + sk2->sk_rcv_saddr == sk->sk_rcv_saddr) break; } } } - return node != NULL; + return sk2 != NULL; } - EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); /* Obtain a reference to a local port for the given sock, @@ -80,30 +91,57 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { - struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo; + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_bind_hashbucket *head; - struct hlist_node *node; struct inet_bind_bucket *tb; - int ret; - struct net *net = sk->sk_net; + int ret, attempts = 5; + struct net *net = sock_net(sk); + int smallest_size = -1, smallest_rover; + kuid_t uid = sock_i_uid(sk); local_bh_disable(); if (!snum) { int remaining, rover, low, high; - inet_get_local_port_range(&low, &high); +again: + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rover = net_random() % remaining + low; + smallest_rover = rover = prandom_u32() % remaining + low; + smallest_size = -1; do { - head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; + if (inet_is_local_reserved_port(net, rover)) + goto next_nolock; + head = &hashinfo->bhash[inet_bhashfn(net, rover, + hashinfo->bhash_size)]; spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->ib_net == net && tb->port == rover) + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == rover) { + if (((tb->fastreuse > 0 && + sk->sk_reuse && + sk->sk_state != TCP_LISTEN) || + (tb->fastreuseport > 0 && + sk->sk_reuseport && + uid_eq(tb->fastuid, uid))) && + (tb->num_owners < smallest_size || smallest_size == -1)) { + smallest_size = tb->num_owners; + smallest_rover = rover; + if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 && + !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { + snum = smallest_rover; + goto tb_found; + } + } + if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { + snum = rover; + goto tb_found; + } goto next; + } break; next: spin_unlock(&head->lock); + next_nolock: if (++rover > high) rover = low; } while (--remaining > 0); @@ -115,33 +153,52 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) * the top level, not from the 'break;' statement. */ ret = 1; - if (remaining <= 0) + if (remaining <= 0) { + if (smallest_size != -1) { + snum = smallest_rover; + goto have_snum; + } goto fail; - + } /* OK, here is the one we will use. HEAD is * non-NULL and we hold it's mutex. */ snum = rover; } else { - head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; +have_snum: + head = &hashinfo->bhash[inet_bhashfn(net, snum, + hashinfo->bhash_size)]; spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->ib_net == net && tb->port == snum) + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == snum) goto tb_found; } tb = NULL; goto tb_not_found; tb_found: if (!hlist_empty(&tb->owners)) { - if (sk->sk_reuse > 1) + if (sk->sk_reuse == SK_FORCE_REUSE) goto success; - if (tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN) { + + if (((tb->fastreuse > 0 && + sk->sk_reuse && sk->sk_state != TCP_LISTEN) || + (tb->fastreuseport > 0 && + sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && + smallest_size == -1) { goto success; } else { ret = 1; - if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) + if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { + if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) || + (tb->fastreuseport > 0 && + sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && + smallest_size != -1 && --attempts >= 0) { + spin_unlock(&head->lock); + goto again; + } + goto fail_unlock; + } } } tb_not_found: @@ -154,13 +211,23 @@ tb_not_found: tb->fastreuse = 1; else tb->fastreuse = 0; - } else if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; + if (sk->sk_reuseport) { + tb->fastreuseport = 1; + tb->fastuid = uid; + } else + tb->fastreuseport = 0; + } else { + if (tb->fastreuse && + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) + tb->fastreuse = 0; + if (tb->fastreuseport && + (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) + tb->fastreuseport = 0; + } success: if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, snum); - BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); + WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); ret = 0; fail_unlock: @@ -169,7 +236,6 @@ fail: local_bh_enable(); return ret; } - EXPORT_SYMBOL_GPL(inet_csk_get_port); /* @@ -197,7 +263,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) * having to remove and re-insert us on the wait queue. */ for (;;) { - prepare_to_wait_exclusive(sk->sk_sleep, &wait, + prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) @@ -216,7 +282,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) if (!timeo) break; } - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return err; } @@ -226,7 +292,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) { struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct sock *newsk; + struct request_sock *req; int error; lock_sock(sk); @@ -239,7 +307,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) goto out_err; /* Find already established connection */ - if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { + if (reqsk_queue_empty(queue)) { long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); /* If this is a non blocking socket don't sleep */ @@ -251,18 +319,35 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) if (error) goto out_err; } - - newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); - BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); + req = reqsk_queue_remove(queue); + newsk = req->sk; + + sk_acceptq_removed(sk); + if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) { + spin_lock_bh(&queue->fastopenq->lock); + if (tcp_rsk(req)->listener) { + /* We are still waiting for the final ACK from 3WHS + * so can't free req now. Instead, we set req->sk to + * NULL to signify that the child socket is taken + * so reqsk_fastopen_remove() will free the req + * when 3WHS finishes (or is aborted). + */ + req->sk = NULL; + req = NULL; + } + spin_unlock_bh(&queue->fastopenq->lock); + } out: release_sock(sk); + if (req) + __reqsk_free(req); return newsk; out_err: newsk = NULL; + req = NULL; *err = error; goto out; } - EXPORT_SYMBOL(inet_csk_accept); /* @@ -284,7 +369,6 @@ void inet_csk_init_xmit_timers(struct sock *sk, setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } - EXPORT_SYMBOL(inet_csk_init_xmit_timers); void inet_csk_clear_xmit_timers(struct sock *sk) @@ -297,63 +381,97 @@ void inet_csk_clear_xmit_timers(struct sock *sk) sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } - EXPORT_SYMBOL(inet_csk_clear_xmit_timers); void inet_csk_delete_keepalive_timer(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); } - EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) { sk_reset_timer(sk, &sk->sk_timer, jiffies + len); } - EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); -struct dst_entry* inet_csk_route_req(struct sock *sk, +struct dst_entry *inet_csk_route_req(struct sock *sk, + struct flowi4 *fl4, const struct request_sock *req) { struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); - struct ip_options *opt = inet_rsk(req)->opt; - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .nl_u = { .ip4_u = - { .daddr = ((opt && opt->srr) ? - opt->faddr : - ireq->rmt_addr), - .saddr = ireq->loc_addr, - .tos = RT_CONN_FLAGS(sk) } }, - .proto = sk->sk_protocol, - .uli_u = { .ports = - { .sport = inet_sk(sk)->sport, - .dport = ireq->rmt_port } } }; - - security_req_classify_flow(req, &fl); - if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0)) { - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { - ip_rt_put(rt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - return &rt->u.dst; + struct ip_options_rcu *opt = inet_rsk(req)->opt; + struct net *net = sock_net(sk); + int flags = inet_sk_flowi_flags(sk); + + flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + sk->sk_protocol, + flags, + (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, + ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); + security_req_classify_flow(req, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(net, fl4, sk); + if (IS_ERR(rt)) + goto no_route; + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) + goto route_err; + return &rt->dst; + +route_err: + ip_rt_put(rt); +no_route: + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + return NULL; } - EXPORT_SYMBOL_GPL(inet_csk_route_req); +struct dst_entry *inet_csk_route_child_sock(struct sock *sk, + struct sock *newsk, + const struct request_sock *req) +{ + const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_sock *newinet = inet_sk(newsk); + struct ip_options_rcu *opt; + struct net *net = sock_net(sk); + struct flowi4 *fl4; + struct rtable *rt; + + fl4 = &newinet->cork.fl.u.ip4; + + rcu_read_lock(); + opt = rcu_dereference(newinet->inet_opt); + flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + sk->sk_protocol, inet_sk_flowi_flags(sk), + (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, + ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); + security_req_classify_flow(req, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(net, fl4, sk); + if (IS_ERR(rt)) + goto no_route; + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) + goto route_err; + rcu_read_unlock(); + return &rt->dst; + +route_err: + ip_rt_put(rt); +no_route: + rcu_read_unlock(); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + return NULL; +} +EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); + static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd, const u32 synq_hsize) { return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6) #define AF_INET_FAMILY(fam) ((fam) == AF_INET) #else #define AF_INET_FAMILY(fam) 1 @@ -374,11 +492,11 @@ struct request_sock *inet_csk_search_req(const struct sock *sk, prev = &req->dl_next) { const struct inet_request_sock *ireq = inet_rsk(req); - if (ireq->rmt_port == rport && - ireq->rmt_addr == raddr && - ireq->loc_addr == laddr && + if (ireq->ir_rmt_port == rport && + ireq->ir_rmt_addr == raddr && + ireq->ir_loc_addr == laddr && AF_INET_FAMILY(req->rsk_ops->family)) { - BUG_TRAP(!req->sk); + WARN_ON(req->sk); *prevp = prev; break; } @@ -386,7 +504,6 @@ struct request_sock *inet_csk_search_req(const struct sock *sk, return req; } - EXPORT_SYMBOL_GPL(inet_csk_search_req); void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, @@ -394,17 +511,50 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, { struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, + const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, + inet_rsk(req)->ir_rmt_port, lopt->hash_rnd, lopt->nr_table_entries); reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); inet_csk_reqsk_queue_added(sk, timeout); } +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); /* Only thing we need from tcp.h */ extern int sysctl_tcp_synack_retries; -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); + +/* Decide when to expire the request and when to resend SYN-ACK */ +static inline void syn_ack_recalc(struct request_sock *req, const int thresh, + const int max_retries, + const u8 rskq_defer_accept, + int *expire, int *resend) +{ + if (!rskq_defer_accept) { + *expire = req->num_timeout >= thresh; + *resend = 1; + return; + } + *expire = req->num_timeout >= thresh && + (!inet_rsk(req)->acked || req->num_timeout >= max_retries); + /* + * Do not resend while waiting for data after ACK, + * start to resend on end of deferring period to give + * last chance for data or ACK to create established socket. + */ + *resend = !inet_rsk(req)->acked || + req->num_timeout >= rskq_defer_accept - 1; +} + +int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) +{ + int err = req->rsk_ops->rtx_syn_ack(parent, req); + + if (!err) + req->num_retrans++; + return err; +} +EXPORT_SYMBOL(inet_rtx_syn_ack); void inet_csk_reqsk_queue_prune(struct sock *parent, const unsigned long interval, @@ -425,7 +575,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. - * If synack was not acknowledged for 3 seconds, it means + * If synack was not acknowledged for 1 second, it means * one of the following things: synack was lost, ack was lost, * rtt is high or nobody planned to ack (i.e. synflood). * When server is a bit loaded, queue is populated with old @@ -461,14 +611,22 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, reqp=&lopt->syn_table[i]; while ((req = *reqp) != NULL) { if (time_after_eq(now, req->expires)) { - if ((req->retrans < thresh || - (inet_rsk(req)->acked && req->retrans < max_retries)) - && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { + int expire = 0, resend = 0; + + syn_ack_recalc(req, thresh, max_retries, + queue->rskq_defer_accept, + &expire, &resend); + req->rsk_ops->syn_ack_timeout(parent, req); + if (!expire && + (!resend || + !inet_rtx_syn_ack(parent, req) || + inet_rsk(req)->acked)) { unsigned long timeo; - if (req->retrans++ == 0) + if (req->num_timeout++ == 0) lopt->qlen_young--; - timeo = min((timeout << req->retrans), max_rto); + timeo = min(timeout << req->num_timeout, + max_rto); req->expires = now + timeo; reqp = &req->dl_next; continue; @@ -492,13 +650,21 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, if (lopt->qlen) inet_csk_reset_keepalive_timer(parent, interval); } - EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); -struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, - const gfp_t priority) +/** + * inet_csk_clone_lock - clone an inet socket, and lock its clone + * @sk: the socket to clone + * @req: request_sock + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * + * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + */ +struct sock *inet_csk_clone_lock(const struct sock *sk, + const struct request_sock *req, + const gfp_t priority) { - struct sock *newsk = sk_clone(sk, priority); + struct sock *newsk = sk_clone_lock(sk, priority); if (newsk != NULL) { struct inet_connection_sock *newicsk = inet_csk(newsk); @@ -506,9 +672,13 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, newsk->sk_state = TCP_SYN_RECV; newicsk->icsk_bind_hash = NULL; - inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; + inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; + inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; + inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); newsk->sk_write_space = sk_stream_write_space; + newsk->sk_mark = inet_rsk(req)->ir_mark; + newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; @@ -520,8 +690,7 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, } return newsk; } - -EXPORT_SYMBOL_GPL(inet_csk_clone); +EXPORT_SYMBOL_GPL(inet_csk_clone_lock); /* * At this point, there should be no process reference to this @@ -531,14 +700,14 @@ EXPORT_SYMBOL_GPL(inet_csk_clone); */ void inet_csk_destroy_sock(struct sock *sk) { - BUG_TRAP(sk->sk_state == TCP_CLOSE); - BUG_TRAP(sock_flag(sk, SOCK_DEAD)); + WARN_ON(sk->sk_state != TCP_CLOSE); + WARN_ON(!sock_flag(sk, SOCK_DEAD)); /* It cannot be in hash table! */ - BUG_TRAP(sk_unhashed(sk)); + WARN_ON(!sk_unhashed(sk)); - /* If it has not 0 inet_sk(sk)->num, it must be bound */ - BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); + /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */ + WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash); sk->sk_prot->destroy(sk); @@ -548,12 +717,28 @@ void inet_csk_destroy_sock(struct sock *sk) sk_refcnt_debug_release(sk); - atomic_dec(sk->sk_prot->orphan_count); + percpu_counter_dec(sk->sk_prot->orphan_count); sock_put(sk); } - EXPORT_SYMBOL(inet_csk_destroy_sock); +/* This function allows to force a closure of a socket after the call to + * tcp/dccp_create_openreq_child(). + */ +void inet_csk_prepare_forced_close(struct sock *sk) + __releases(&sk->sk_lock.slock) +{ + /* sk_clone_lock locked the socket and set refcnt to 2 */ + bh_unlock_sock(sk); + sock_put(sk); + + /* The below has to be done to allow calling inet_csk_destroy_sock */ + sock_set_flag(sk, SOCK_DEAD); + percpu_counter_inc(sk->sk_prot->orphan_count); + inet_sk(sk)->inet_num = 0; +} +EXPORT_SYMBOL(inet_csk_prepare_forced_close); + int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) { struct inet_sock *inet = inet_sk(sk); @@ -573,8 +758,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) * after validation is complete. */ sk->sk_state = TCP_LISTEN; - if (!sk->sk_prot->get_port(sk, inet->num)) { - inet->sport = htons(inet->num); + if (!sk->sk_prot->get_port(sk, inet->inet_num)) { + inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); sk->sk_prot->hash(sk); @@ -586,7 +771,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) __reqsk_queue_destroy(&icsk->icsk_accept_queue); return -EADDRINUSE; } - EXPORT_SYMBOL_GPL(inet_csk_listen_start); /* @@ -596,13 +780,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start); void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *acc_req; struct request_sock *req; inet_csk_delete_keepalive_timer(sk); /* make all the listen_opt local to us */ - acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); + acc_req = reqsk_queue_yank_acceptq(queue); /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) @@ -612,7 +797,7 @@ void inet_csk_listen_stop(struct sock *sk) * To be honest, we are not able to make either * of the variants now. --ANK */ - reqsk_queue_destroy(&icsk->icsk_accept_queue); + reqsk_queue_destroy(queue); while ((req = acc_req) != NULL) { struct sock *child = req->sk; @@ -621,15 +806,28 @@ void inet_csk_listen_stop(struct sock *sk) local_bh_disable(); bh_lock_sock(child); - BUG_TRAP(!sock_owned_by_user(child)); + WARN_ON(sock_owned_by_user(child)); sock_hold(child); sk->sk_prot->disconnect(child, O_NONBLOCK); sock_orphan(child); - atomic_inc(sk->sk_prot->orphan_count); + percpu_counter_inc(sk->sk_prot->orphan_count); + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) { + BUG_ON(tcp_sk(child)->fastopen_rsk != req); + BUG_ON(sk != tcp_rsk(req)->listener); + + /* Paranoid, to prevent race condition if + * an inbound pkt destined for child is + * blocked by sock lock in tcp_v4_rcv(). + * Also to satisfy an assertion in + * tcp_v4_destroy_sock(). + */ + tcp_sk(child)->fastopen_rsk = NULL; + sock_put(sk); + } inet_csk_destroy_sock(child); bh_unlock_sock(child); @@ -639,9 +837,19 @@ void inet_csk_listen_stop(struct sock *sk) sk_acceptq_removed(sk); __reqsk_free(req); } - BUG_TRAP(!sk->sk_ack_backlog); + if (queue->fastopenq != NULL) { + /* Free all the reqs queued in rskq_rst_head. */ + spin_lock_bh(&queue->fastopenq->lock); + acc_req = queue->fastopenq->rskq_rst_head; + queue->fastopenq->rskq_rst_head = NULL; + spin_unlock_bh(&queue->fastopenq->lock); + while ((req = acc_req) != NULL) { + acc_req = req->dl_next; + __reqsk_free(req); + } + } + WARN_ON(sk->sk_ack_backlog); } - EXPORT_SYMBOL_GPL(inet_csk_listen_stop); void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) @@ -650,31 +858,11 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) const struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; - sin->sin_addr.s_addr = inet->daddr; - sin->sin_port = inet->dport; + sin->sin_addr.s_addr = inet->inet_daddr; + sin->sin_port = inet->inet_dport; } - EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); -int inet_csk_ctl_sock_create(struct socket **sock, unsigned short family, - unsigned short type, unsigned char protocol) -{ - int rc = sock_create_kern(family, type, protocol, sock); - - if (rc == 0) { - (*sock)->sk->sk_allocation = GFP_ATOMIC; - inet_sk((*sock)->sk)->uc_ttl = -1; - /* - * Unhash it so that IP input processing does not even see it, - * we do not wish this socket to see incoming packets. - */ - (*sock)->sk->sk_prot->unhash((*sock)->sk); - } - return rc; -} - -EXPORT_SYMBOL_GPL(inet_csk_ctl_sock_create); - #ifdef CONFIG_COMPAT int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) @@ -687,11 +875,10 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname, return icsk->icsk_af_ops->getsockopt(sk, level, optname, optval, optlen); } - EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt); int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -701,6 +888,51 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, return icsk->icsk_af_ops->setsockopt(sk, level, optname, optval, optlen); } - EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); #endif + +static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ip_options_rcu *inet_opt; + __be32 daddr = inet->inet_daddr; + struct flowi4 *fl4; + struct rtable *rt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + fl4 = &fl->u.ip4; + rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, + inet->inet_saddr, inet->inet_dport, + inet->inet_sport, sk->sk_protocol, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); + if (IS_ERR(rt)) + rt = NULL; + if (rt) + sk_setup_caps(sk, &rt->dst); + rcu_read_unlock(); + + return &rt->dst; +} + +struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) +{ + struct dst_entry *dst = __sk_dst_check(sk, 0); + struct inet_sock *inet = inet_sk(sk); + + if (!dst) { + dst = inet_csk_rebuild_route(sk, &inet->cork.fl); + if (!dst) + goto out; + } + dst->ops->update_pmtu(dst, sk, NULL, mtu); + + dst = __sk_dst_check(sk, 0); + if (!dst) + dst = inet_csk_rebuild_route(sk, &inet->cork.fl); +out: + return dst; +} +EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); |
