Diffstat (limited to 'net/ipv4/tcp_minisocks.c')
-rw-r--r--	net/ipv4/tcp_minisocks.c | 266
1 file changed, 136 insertions(+), 130 deletions(-)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3527b51d615..e68e0d4af6c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,62 +49,12 @@ struct inet_timewait_death_row tcp_death_row = {
 };
 EXPORT_SYMBOL_GPL(tcp_death_row);
 
-/* VJ's idea. Save last timestamp seen from this destination
- * and hold it at least for normal timewait interval to use for duplicate
- * segment detection in subsequent connections, before they enter synchronized
- * state.
- */
-
-static int tcp_remember_stamp(struct sock *sk)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct inet_peer *peer;
-	bool release_it;
-
-	peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
-	if (peer) {
-		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
-		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
-		     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
-			peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
-			peer->tcp_ts = tp->rx_opt.ts_recent;
-		}
-		if (release_it)
-			inet_putpeer(peer);
-		return 1;
-	}
-
-	return 0;
-}
-
-static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
-{
-	struct sock *sk = (struct sock *) tw;
-	struct inet_peer *peer;
-
-	peer = twsk_getpeer(sk);
-	if (peer) {
-		const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
-
-		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
-		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
-		     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
-			peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
-			peer->tcp_ts	   = tcptw->tw_ts_recent;
-		}
-		inet_putpeer(peer);
-		return 1;
-	}
-	return 0;
-}
-
-static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 {
 	if (seq == s_win)
-		return 1;
+		return true;
 	if (after(end_seq, s_win) && before(seq, e_win))
-		return 1;
+		return true;
 	return seq == e_win && seq == end_seq;
 }
 
@@ -135,21 +85,23 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
  * spinlock it. I do not want! Well, probability of misbehaviour
  * is ridiculously low and, seems, we could use some mb() tricks
  * to avoid misread sequence numbers, states etc.  --ANK
+ *
+ * We don't need to initialize tmp_out.sack_ok as we don't use the results
  */
 enum tcp_tw_status
 tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 			   const struct tcphdr *th)
 {
 	struct tcp_options_received tmp_opt;
-	u8 *hash_location;
 	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
-	int paws_reject = 0;
+	bool paws_reject = false;
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
-		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+		tcp_parse_options(skb, &tmp_opt, 0, NULL);
 
 		if (tmp_opt.saw_tstamp) {
+			tmp_opt.rcv_tsecr	-= tcptw->tw_ts_offset;
 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
 			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
 			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
@@ -316,7 +268,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	struct inet_timewait_sock *tw = NULL;
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
-	int recycle_ok = 0;
+	bool recycle_ok = false;
 
 	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
 		recycle_ok = tcp_remember_stamp(sk);
@@ -327,23 +279,25 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	if (tw != NULL) {
 		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+		struct inet_sock *inet = inet_sk(sk);
 
+		tw->tw_transparent	= inet->transparent;
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
 		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
 		tcptw->tw_snd_nxt	= tp->snd_nxt;
 		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
 		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+		tcptw->tw_ts_offset	= tp->tsoffset;
 
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == PF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
-			struct inet6_timewait_sock *tw6;
 
-			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
-			tw6 = inet6_twsk((struct sock *)tw);
-			ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
-			ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
+			tw->tw_v6_daddr = sk->sk_v6_daddr;
+			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+			tw->tw_tclass = np->tclass;
+			tw->tw_flowlabel = np->flow_label >> 12;
 			tw->tw_ipv6only = np->ipv6only;
 		}
 #endif
@@ -357,13 +311,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		 */
 		do {
 			struct tcp_md5sig_key *key;
-			memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key));
-			tcptw->tw_md5_keylen = 0;
+			tcptw->tw_md5_key = NULL;
 			key = tp->af_specific->md5_lookup(sk, sk);
 			if (key != NULL) {
-				memcpy(&tcptw->tw_md5_key, key->key, key->keylen);
-				tcptw->tw_md5_keylen = key->keylen;
-				if (tcp_alloc_md5sig_pool(sk) == NULL)
+				tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
+				if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())
 					BUG();
 			}
 		} while (0);
@@ -392,7 +344,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		 * socket up.  We've got bigger problems than
 		 * non-graceful socket closings.
 		 */
-		LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
 	}
 
 	tcp_update_metrics(sk);
@@ -403,12 +355,44 @@ void tcp_twsk_destructor(struct sock *sk)
 {
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
-	if (twsk->tw_md5_keylen)
-		tcp_free_md5sig_pool();
+
+	if (twsk->tw_md5_key)
+		kfree_rcu(twsk->tw_md5_key, rcu);
 #endif
 }
 EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
 
+void tcp_openreq_init_rwin(struct request_sock *req,
+			   struct sock *sk, struct dst_entry *dst)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u8 rcv_wscale;
+	int mss = dst_metric_advmss(dst);
+
+	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
+		mss = tp->rx_opt.user_mss;
+
+	/* Set this up on the first call only */
+	req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+
+	/* limit the window selection if the user enforce a smaller rx buffer */
+	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+	    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
+		req->window_clamp = tcp_full_space(sk);
+
+	/* tcp_full_space because it is guaranteed to be the first packet */
+	tcp_select_initial_window(tcp_full_space(sk),
+		mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+		&req->rcv_wnd,
+		&req->window_clamp,
+		ireq->wscale_ok,
+		&rcv_wscale,
+		dst_metric(dst, RTAX_INITRWND));
+	ireq->rcv_wscale = rcv_wscale;
+}
+EXPORT_SYMBOL(tcp_openreq_init_rwin);
+
 static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
 					 struct request_sock *req)
 {
@@ -423,39 +407,13 @@ static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
  */
 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
 {
-	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
+	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
 
 	if (newsk != NULL) {
 		const struct inet_request_sock *ireq = inet_rsk(req);
 		struct tcp_request_sock *treq = tcp_rsk(req);
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 		struct tcp_sock *newtp = tcp_sk(newsk);
-		struct tcp_sock *oldtp = tcp_sk(sk);
-		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
-
-		/* TCP Cookie Transactions require space for the cookie pair,
-		 * as it differs for each connection.  There is no need to
-		 * copy any s_data_payload stored at the original socket.
-		 * Failure will prevent resuming the connection.
-		 *
-		 * Presumed copied, in order of appearance:
-		 *	cookie_in_always, cookie_out_never
-		 */
-		if (oldcvp != NULL) {
-			struct tcp_cookie_values *newcvp =
-				kzalloc(sizeof(*newtp->cookie_values),
-					GFP_ATOMIC);
-
-			if (newcvp != NULL) {
-				kref_init(&newcvp->kref);
-				newcvp->cookie_desired =
-						oldcvp->cookie_desired;
-				newtp->cookie_values = newcvp;
-			} else {
-				/* Not Yet Implemented */
-				newtp->cookie_values = NULL;
-			}
-		}
 
 		/* Now setup tcp_sock */
 		newtp->pred_flags = 0;
@@ -464,15 +422,15 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->rcv_nxt = treq->rcv_isn + 1;
 
 		newtp->snd_sml = newtp->snd_una =
-		newtp->snd_nxt = newtp->snd_up =
-			treq->snt_isn + 1 + tcp_s_data_size(oldtp);
+		newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
 
 		tcp_prequeue_init(newtp);
+		INIT_LIST_HEAD(&newtp->tsq_node);
 
 		tcp_init_wl(newtp, treq->rcv_isn);
 
-		newtp->srtt = 0;
-		newtp->mdev = TCP_TIMEOUT_INIT;
+		newtp->srtt_us = 0;
+		newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
 		newicsk->icsk_rto = TCP_TIMEOUT_INIT;
 
 		newtp->packets_out = 0;
@@ -480,26 +438,27 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->sacked_out = 0;
 		newtp->fackets_out = 0;
 		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+		tcp_enable_early_retrans(newtp);
+		newtp->tlp_high_seq = 0;
+		newtp->lsndtime = treq->snt_synack;
+		newtp->total_retrans = req->num_retrans;
 
 		/* So many TCP implementations out there (incorrectly) count the
 		 * initial SYN frame in their delayed-ACK and congestion control
 		 * algorithms that we must have the following bandaid to talk
 		 * efficiently to them.  -DaveM
 		 */
-		newtp->snd_cwnd = 2;
+		newtp->snd_cwnd = TCP_INIT_CWND;
 		newtp->snd_cwnd_cnt = 0;
-		newtp->bytes_acked = 0;
 
-		newtp->frto_counter = 0;
-		newtp->frto_highmark = 0;
-
-		newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
+		if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
+		    !try_module_get(newicsk->icsk_ca_ops->owner))
+			newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
 
 		tcp_set_ca_state(newsk, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
-		skb_queue_head_init(&newtp->out_of_order_queue);
-		newtp->write_seq = newtp->pushed_seq =
-			treq->snt_isn + 1 + tcp_s_data_size(oldtp);
+		__skb_queue_head_init(&newtp->out_of_order_queue);
+		newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
 
 		newtp->rx_opt.saw_tstamp = 0;
 
@@ -540,6 +499,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 			newtp->rx_opt.ts_recent_stamp = 0;
 			newtp->tcp_header_len = sizeof(struct tcphdr);
 		}
+		newtp->tsoffset = 0;
 #ifdef CONFIG_TCP_MD5SIG
 		newtp->md5sig_info = NULL;	/*XXX*/
 		if (newtp->af_specific->md5_lookup(sk, newsk))
@@ -549,6 +509,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
 		newtp->rx_opt.mss_clamp = req->mss;
 		TCP_ECN_openreq_child(newtp, req);
+		newtp->fastopen_rsk = NULL;
+		newtp->syn_data_acked = 0;
 
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 	}
@@ -557,24 +519,32 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 EXPORT_SYMBOL(tcp_create_openreq_child);
 
 /*
- *	Process an incoming packet for SYN_RECV sockets represented
- *	as a request_sock.
+ * Process an incoming packet for SYN_RECV sockets represented as a
+ * request_sock. Normally sk is the listener socket but for TFO it
+ * points to the child socket.
+ *
+ * XXX (TFO) - The current impl contains a special check for ack
+ * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
+ *
+ * We don't need to initialize tmp_opt.sack_ok as we don't use the results
 */
 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			   struct request_sock *req,
-			   struct request_sock **prev)
+			   struct request_sock **prev,
+			   bool fastopen)
 {
 	struct tcp_options_received tmp_opt;
-	u8 *hash_location;
 	struct sock *child;
 	const struct tcphdr *th = tcp_hdr(skb);
 	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
-	int paws_reject = 0;
+	bool paws_reject = false;
+
+	BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
-		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+		tcp_parse_options(skb, &tmp_opt, 0, NULL);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent = req->ts_recent;
@@ -582,7 +552,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			 * it can be estimated (approximately)
 			 * from another data.
 			 */
-			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
+			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
 			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
 		}
 	}
@@ -607,8 +577,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		 *
 		 * Enforce "SYN-ACK" according to figure 8, figure 6
 		 * of RFC793, fixed by RFC1122.
+		 *
+		 * Note that even if there is new data in the SYN packet
+		 * they will be thrown away too.
+		 *
+		 * Reset timer after retransmitting SYNACK, similar to
+		 * the idea of fast retransmit in recovery.
 		 */
-		req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+		if (!inet_rtx_syn_ack(sk, req))
+			req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
+					   TCP_RTO_MAX) + jiffies;
 		return NULL;
 	}
 
@@ -664,11 +642,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	 *                  sent (the segment carries an unacceptable ACK) ...
 	 *                  a reset is sent."
 	 *
-	 * Invalid ACK: reset will be sent by listening socket
+	 * Invalid ACK: reset will be sent by listening socket.
+	 * Note that the ACK validity check for a Fast Open socket is done
+	 * elsewhere and is checked directly against the child socket rather
+	 * than req because user data may have been sent out.
 	 */
-	if ((flg & TCP_FLAG_ACK) &&
+	if ((flg & TCP_FLAG_ACK) && !fastopen &&
 	    (TCP_SKB_CB(skb)->ack_seq !=
-	     tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
+	     tcp_rsk(req)->snt_isn + 1))
 		return sk;
 
 	/* Also, it would be not so bad idea to check rcv_tsecr, which
@@ -679,7 +660,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	/* RFC793: "first check sequence number". */
 
 	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
-					  tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
+					  tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
 		/* Out of window: send ACK and drop. */
 		if (!(flg & TCP_FLAG_RST))
 			req->rsk_ops->send_ack(sk, skb, req);
@@ -690,7 +671,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
 	/* In sequence, PAWS is OK. */
 
-	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
+	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
 		req->ts_recent = tmp_opt.rcv_tsval;
 
 	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -709,12 +690,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
 	/* ACK sequence verified above, just make sure ACK is
 	 * set.  If ACK not set, just silently drop the packet.
+	 *
+	 * XXX (TFO) - if we ever allow "data after SYN", the
+	 * following check needs to be removed.
 	 */
 	if (!(flg & TCP_FLAG_ACK))
 		return NULL;
 
+	/* For Fast Open no more processing is needed (sk is the
+	 * child socket).
+	 */
+	if (fastopen)
+		return sk;
+
 	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
-	if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+	if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
 	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
 		inet_rsk(req)->acked = 1;
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
@@ -744,11 +734,21 @@ listen_overflow:
 	}
 
 embryonic_reset:
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
-	if (!(flg & TCP_FLAG_RST))
+	if (!(flg & TCP_FLAG_RST)) {
+		/* Received a bad SYN pkt - for TFO We try not to reset
+		 * the local connection unless it's really necessary to
+		 * avoid becoming vulnerable to outside attack aiming at
+		 * resetting legit local connections.
+		 */
 		req->rsk_ops->send_reset(sk, skb);
-
-	inet_csk_reqsk_queue_drop(sk, req, prev);
+	} else if (fastopen) { /* received a valid RST pkt */
+		reqsk_fastopen_remove(sk, req, true);
+		tcp_reset(sk);
+	}
+	if (!fastopen) {
+		inet_csk_reqsk_queue_drop(sk, req, prev);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+	}
 	return NULL;
 }
 EXPORT_SYMBOL(tcp_check_req);
@@ -757,6 +757,12 @@ EXPORT_SYMBOL(tcp_check_req);
  * Queue segment on the new socket if the new socket is active,
 * otherwise we just shortcircuit this and continue with
 * the new socket.
+ *
+ * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
+ * when entering. But other states are possible due to a race condition
+ * where after __inet_lookup_established() fails but before the listener
+ * locked is obtained, other packets cause the same connection to
+ * be created.
 */
 
 int tcp_child_process(struct sock *parent, struct sock *child,
@@ -770,7 +776,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 					    skb->len);
 		/* Wakeup parent, send SIGIO */
 		if (state == TCP_SYN_RECV && child->sk_state != state)
-			parent->sk_data_ready(parent, 0);
+			parent->sk_data_ready(parent);
 	} else {
 		/* Alas, it is possible again, because we do lookup
 		 * in main socket hash table and lock on listening
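Note on the sequence-window check used above: tcp_in_window() accepts a segment if it starts at the left window edge or overlaps the advertised window, and it relies on the kernel's before()/after() helpers, which compare 32-bit sequence numbers via a signed difference so the comparison stays correct across wraparound. The standalone sketch below is an illustration only, not part of the patch; it reimplements before()/after() with the same signed-difference trick and mirrors the patched tcp_in_window() so the wraparound behaviour can be tried in user space.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe ordering: a signed 32-bit difference keeps the
 * comparison correct even when the sequence space wraps past 2^32. */
static bool before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}

static bool after(uint32_t seq1, uint32_t seq2)
{
	return before(seq2, seq1);
}

/* Same acceptance test as the patched tcp_in_window(): accept a segment
 * that starts at the window edge or overlaps the [s_win, e_win) window. */
static bool tcp_in_window(uint32_t seq, uint32_t end_seq, uint32_t s_win, uint32_t e_win)
{
	if (seq == s_win)
		return true;
	if (after(end_seq, s_win) && before(seq, e_win))
		return true;
	return seq == e_win && seq == end_seq;
}

int main(void)
{
	/* Window [0xfffffff0, 0x00000010) straddles the wrap point;
	 * segment [0xfffffffa, 0x00000002) still overlaps it. */
	printf("%d\n", tcp_in_window(0xfffffffaU, 0x00000002U,
				     0xfffffff0U, 0x00000010U));	/* prints 1 */
	/* A segment entirely beyond the right edge is rejected. */
	printf("%d\n", tcp_in_window(0x00000020U, 0x00000030U,
				     0xfffffff0U, 0x00000010U));	/* prints 0 */
	return 0;
}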
