Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c	281
1 file changed, 148 insertions(+), 133 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d377f4854cb..432c36649db 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -237,7 +237,11 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
 			tcp_enter_quickack_mode((struct sock *)tp);
 		break;
 	case INET_ECN_CE:
-		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+			/* Better not delay acks, sender can have a very low cwnd */
+			tcp_enter_quickack_mode((struct sock *)tp);
+			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+		}
 		/* fallinto */
 	default:
 		tp->ecn_flags |= TCP_ECN_SEEN;
@@ -374,7 +378,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 /* 4. Try to fixup all. It is made immediately after connection enters
  *    established state.
  */
-static void tcp_init_buffer_space(struct sock *sk)
+void tcp_init_buffer_space(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int maxwin;
@@ -739,29 +743,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
-/* Set slow start threshold and cwnd not falling to slow start */
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-
-	tp->prior_ssthresh = 0;
-	tp->bytes_acked = 0;
-	if (icsk->icsk_ca_state < TCP_CA_CWR) {
-		tp->undo_marker = 0;
-		if (set_ssthresh)
-			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
-		tp->snd_cwnd = min(tp->snd_cwnd,
-				   tcp_packets_in_flight(tp) + 1U);
-		tp->snd_cwnd_cnt = 0;
-		tp->high_seq = tp->snd_nxt;
-		tp->snd_cwnd_stamp = tcp_time_stamp;
-		TCP_ECN_queue_cwr(tp);
-
-		tcp_set_ca_state(sk, TCP_CA_CWR);
-	}
-}
-
 /*
  * Packet counting of FACK is based on in-order assumptions, therefore TCP
  * disables it when reordering is detected
@@ -2489,35 +2470,6 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-/* Lower bound on congestion window is slow start threshold
- * unless congestion avoidance choice decides to overide it.
- */
-static inline u32 tcp_cwnd_min(const struct sock *sk)
-{
-	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
-
-	return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
-}
-
-/* Decrease cwnd each second ack. */
-static void tcp_cwnd_down(struct sock *sk, int flag)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	int decr = tp->snd_cwnd_cnt + 1;
-
-	if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
-	    (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
-		tp->snd_cwnd_cnt = decr & 1;
-		decr >>= 1;
-
-		if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
-			tp->snd_cwnd -= decr;
-
-		tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
-		tp->snd_cwnd_stamp = tcp_time_stamp;
-	}
-}
-
 /* Nothing was retransmitted or returned timestamp is less
  * than timestamp of the first retransmission.
  */
@@ -2719,24 +2671,80 @@ static bool tcp_try_undo_loss(struct sock *sk)
 	return false;
 }
 
-static inline void tcp_complete_cwr(struct sock *sk)
+/* The cwnd reduction in CWR and Recovery use the PRR algorithm
+ * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
+ * It computes the number of packets to send (sndcnt) based on packets newly
+ * delivered:
+ *   1) If the packets in flight is larger than ssthresh, PRR spreads the
+ *	cwnd reductions across a full RTT.
+ *   2) If packets in flight is lower than ssthresh (such as due to excess
+ *	losses and/or application stalls), do not perform any further cwnd
+ *	reductions, but instead slow start up to ssthresh.
+ */
+static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	/* Do not moderate cwnd if it's already undone in cwr or recovery. */
-	if (tp->undo_marker) {
-		if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
-			tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
-			tp->snd_cwnd_stamp = tcp_time_stamp;
-		} else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
-			/* PRR algorithm. */
-			tp->snd_cwnd = tp->snd_ssthresh;
-			tp->snd_cwnd_stamp = tcp_time_stamp;
-		}
+	tp->high_seq = tp->snd_nxt;
+	tp->bytes_acked = 0;
+	tp->snd_cwnd_cnt = 0;
+	tp->prior_cwnd = tp->snd_cwnd;
+	tp->prr_delivered = 0;
+	tp->prr_out = 0;
+	if (set_ssthresh)
+		tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+	TCP_ECN_queue_cwr(tp);
+}
+
+static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
+			       int fast_rexmit)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int sndcnt = 0;
+	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
+
+	tp->prr_delivered += newly_acked_sacked;
+	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
+			       tp->prior_cwnd - 1;
+		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
+	} else {
+		sndcnt = min_t(int, delta,
+			       max_t(int, tp->prr_delivered - tp->prr_out,
+				     newly_acked_sacked) + 1);
+	}
+
+	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
+}
+
+static inline void tcp_end_cwnd_reduction(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
+	if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
+	    (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
+		tp->snd_cwnd = tp->snd_ssthresh;
+		tp->snd_cwnd_stamp = tcp_time_stamp;
 	}
 	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
 }
 
+/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
+void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->prior_ssthresh = 0;
+	tp->bytes_acked = 0;
+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+		tp->undo_marker = 0;
+		tcp_init_cwnd_reduction(sk, set_ssthresh);
+		tcp_set_ca_state(sk, TCP_CA_CWR);
+	}
+}
+
 static void tcp_try_keep_open(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2751,7 +2759,7 @@ static void tcp_try_keep_open(struct sock *sk)
 	}
 }
 
-static void tcp_try_to_open(struct sock *sk, int flag)
+static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2768,7 +2776,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
 		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
 			tcp_moderate_cwnd(tp);
 	} else {
-		tcp_cwnd_down(sk, flag);
+		tcp_cwnd_reduction(sk, newly_acked_sacked, 0);
 	}
 }
 
@@ -2850,38 +2858,6 @@ void tcp_simple_retransmit(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_simple_retransmit);
 
-/* This function implements the PRR algorithm, specifcally the PRR-SSRB
- * (proportional rate reduction with slow start reduction bound) as described in
- * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
- * It computes the number of packets to send (sndcnt) based on packets newly
- * delivered:
- *   1) If the packets in flight is larger than ssthresh, PRR spreads the
- *	cwnd reductions across a full RTT.
- *   2) If packets in flight is lower than ssthresh (such as due to excess
- *	losses and/or application stalls), do not perform any further cwnd
- *	reductions, but instead slow start up to ssthresh.
- */
-static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
-					int fast_rexmit, int flag)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	int sndcnt = 0;
-	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
-
-	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
-		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
-			       tp->prior_cwnd - 1;
-		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
-	} else {
-		sndcnt = min_t(int, delta,
-			       max_t(int, tp->prr_delivered - tp->prr_out,
-				     newly_acked_sacked) + 1);
-	}
-
-	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
-	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
-}
-
 static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2894,7 +2870,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 
 	NET_INC_STATS_BH(sock_net(sk), mib_idx);
 
-	tp->high_seq = tp->snd_nxt;
 	tp->prior_ssthresh = 0;
 	tp->undo_marker = tp->snd_una;
 	tp->undo_retrans = tp->retrans_out;
@@ -2902,15 +2877,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 		if (!ece_ack)
 			tp->prior_ssthresh = tcp_current_ssthresh(sk);
-		tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
-		TCP_ECN_queue_cwr(tp);
+		tcp_init_cwnd_reduction(sk, true);
 	}
-
-	tp->bytes_acked = 0;
-	tp->snd_cwnd_cnt = 0;
-	tp->prior_cwnd = tp->snd_cwnd;
-	tp->prr_delivered = 0;
-	tp->prr_out = 0;
 	tcp_set_ca_state(sk, TCP_CA_Recovery);
 }
 
@@ -2970,7 +2938,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 			/* CWR is to be held something *above* high_seq
 			 * is ACKed for CWR bit to reach receiver. */
 			if (tp->snd_una != tp->high_seq) {
-				tcp_complete_cwr(sk);
+				tcp_end_cwnd_reduction(sk);
 				tcp_set_ca_state(sk, TCP_CA_Open);
 			}
 			break;
@@ -2980,7 +2948,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 				tcp_reset_reno_sack(tp);
 			if (tcp_try_undo_recovery(sk))
 				return;
-			tcp_complete_cwr(sk);
+			tcp_end_cwnd_reduction(sk);
 			break;
 		}
 	}
@@ -3021,7 +2989,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 			tcp_try_undo_dsack(sk);
 
 		if (!tcp_time_to_recover(sk, flag)) {
-			tcp_try_to_open(sk, flag);
+			tcp_try_to_open(sk, flag, newly_acked_sacked);
 			return;
 		}
 
@@ -3043,8 +3011,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 
 	if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
 		tcp_update_scoreboard(sk, fast_rexmit);
-	tp->prr_delivered += newly_acked_sacked;
-	tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
+	tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit);
 	tcp_xmit_retransmit_queue(sk);
 }
 
@@ -3123,6 +3090,12 @@ void tcp_rearm_rto(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	/* If the retrans timer is currently being used by Fast Open
+	 * for SYN-ACK retrans purpose, stay put.
+	 */
+	if (tp->fastopen_rsk)
+		return;
+
 	if (!tp->packets_out) {
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	} else {
@@ -3384,7 +3357,7 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
-		!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
+		!tcp_in_cwnd_reduction(sk);
 }
 
 /* Check that window update is acceptable.
@@ -3452,9 +3425,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
 }
 
 /* A conservative spurious RTO response algorithm: reduce cwnd using
- * rate halving and continue in congestion avoidance.
+ * PRR and continue in congestion avoidance.
  */
-static void tcp_ratehalving_spur_to_response(struct sock *sk)
+static void tcp_cwr_spur_to_response(struct sock *sk)
 {
 	tcp_enter_cwr(sk, 0);
 }
@@ -3462,7 +3435,7 @@ static void tcp_ratehalving_spur_to_response(struct sock *sk)
 static void tcp_undo_spur_to_response(struct sock *sk, int flag)
 {
 	if (flag & FLAG_ECE)
-		tcp_ratehalving_spur_to_response(sk);
+		tcp_cwr_spur_to_response(sk);
 	else
 		tcp_undo_cwr(sk, true);
 }
@@ -3569,7 +3542,7 @@ static bool tcp_process_frto(struct sock *sk, int flag)
 			tcp_conservative_spur_to_response(tp);
 			break;
 		default:
-			tcp_ratehalving_spur_to_response(sk);
+			tcp_cwr_spur_to_response(sk);
 			break;
 		}
 		tp->frto_counter = 0;
@@ -4034,7 +4007,7 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
 }
 
 /* When we get a reset we do this. */
-static void tcp_reset(struct sock *sk)
+void tcp_reset(struct sock *sk)
 {
 	/* We want the right error as BSD sees it (and indeed as we do). */
 	switch (sk->sk_state) {
@@ -5740,7 +5713,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		TCP_ECN_rcv_synack(tp, th);
 
-		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 		tcp_ack(sk, skb, FLAG_SLOWPATH);
 
 		/* Ok.. it's good. Set up sequence numbers and
@@ -5753,7 +5726,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 * never scaled.
 		 */
 		tp->snd_wnd = ntohs(th->window);
-		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 
 		if (!tp->rx_opt.wscale_ok) {
 			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5891,7 +5863,9 @@ discard:
 		tcp_send_synack(sk);
 #if 0
 		/* Note, we could accept data and URG from this segment.
-		 * There are no obstacles to make this.
+		 * There are no obstacles to make this (except that we must
+		 * either change tcp_recvmsg() to prevent it from returning data
+		 * before 3WHS completes per RFC793, or employ TCP Fast Open).
 		 *
 		 * However, if we ignore data in ACKless segments sometimes,
 		 * we have no reasons to accept it sometimes.
@@ -5931,6 +5905,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock *req;
 	int queued = 0;
 
 	tp->rx_opt.saw_tstamp = 0;
@@ -5986,6 +5961,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		return 0;
 	}
 
+	req = tp->fastopen_rsk;
+	if (req != NULL) {
+		BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+		    sk->sk_state != TCP_FIN_WAIT1);
+
+		if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
+			goto discard;
+	}
 	if (!tcp_validate_incoming(sk, skb, th, 0))
 		return 0;
 
@@ -5996,7 +5979,25 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		switch (sk->sk_state) {
 		case TCP_SYN_RECV:
 			if (acceptable) {
-				tp->copied_seq = tp->rcv_nxt;
+				/* Once we leave TCP_SYN_RECV, we no longer
+				 * need req so release it.
+				 */
+				if (req) {
+					tcp_synack_rtt_meas(sk, req);
+					tp->total_retrans = req->retrans;
+
+					reqsk_fastopen_remove(sk, req, false);
+				} else {
+					/* Make sure socket is routed, for
+					 * correct metrics.
+					 */
+					icsk->icsk_af_ops->rebuild_header(sk);
+					tcp_init_congestion_control(sk);
+
+					tcp_mtup_init(sk);
+					tcp_init_buffer_space(sk);
+					tp->copied_seq = tp->rcv_nxt;
+				}
 				smp_mb();
 				tcp_set_state(sk, TCP_ESTABLISHED);
 				sk->sk_state_change(sk);
@@ -6018,23 +6019,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				if (tp->rx_opt.tstamp_ok)
 					tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
 
-				/* Make sure socket is routed, for
-				 * correct metrics.
-				 */
-				icsk->icsk_af_ops->rebuild_header(sk);
-
-				tcp_init_metrics(sk);
-
-				tcp_init_congestion_control(sk);
+				if (req) {
+					/* Re-arm the timer because data may
+					 * have been sent out. This is similar
+					 * to the regular data transmission case
+					 * when new data has just been ack'ed.
+					 *
+					 * (TFO) - we could try to be more
+					 * aggressive and retranmitting any data
+					 * sooner based on when they were sent
+					 * out.
+					 */
+					tcp_rearm_rto(sk);
+				} else
+					tcp_init_metrics(sk);
 
 				/* Prevent spurious tcp_cwnd_restart() on
 				 * first data packet.
 				 */
 				tp->lsndtime = tcp_time_stamp;
 
-				tcp_mtup_init(sk);
 				tcp_initialize_rcv_mss(sk);
-				tcp_init_buffer_space(sk);
 				tcp_fast_path_on(tp);
 			} else {
 				return 1;
@@ -6042,6 +6047,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			break;
 
 		case TCP_FIN_WAIT1:
+			/* If we enter the TCP_FIN_WAIT1 state and we are a
+			 * Fast Open socket and this is the first acceptable
+			 * ACK we have received, this would have acknowledged
+			 * our SYNACK so stop the SYNACK timer.
+			 */
+			if (acceptable && req != NULL) {
+				/* We no longer need the request sock. */
+				reqsk_fastopen_remove(sk, req, false);
+				tcp_rearm_rto(sk);
+			}
			if (tp->snd_una == tp->write_seq) {
 				struct dst_entry *dst;
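
For readers who want to trace the PRR arithmetic that the new tcp_cwnd_reduction() performs, here is a minimal user-space sketch of the same sndcnt computation. It is an illustration only: struct prr_state and prr_sndcnt() are invented names, not kernel API, and prr_delivered is assumed to already include the packets newly delivered by the current ACK (the kernel adds newly_acked_sacked to tp->prr_delivered just before this calculation).

/* Hypothetical user-space model of the PRR sndcnt computation; not kernel code. */
#include <stdio.h>
#include <stdint.h>

struct prr_state {
	uint32_t prior_cwnd;	/* cwnd when the reduction started */
	uint32_t snd_ssthresh;	/* target cwnd at the end of the reduction */
	uint32_t prr_delivered;	/* packets delivered since the reduction started,
				 * assumed to already include this ACK */
	uint32_t prr_out;	/* packets sent since the reduction started */
};

/* How many packets may be sent in response to this ACK (PRR / PRR-SSRB). */
static int prr_sndcnt(const struct prr_state *s, uint32_t in_flight,
		      int newly_acked_sacked, int fast_rexmit)
{
	int sndcnt;
	int delta = (int)s->snd_ssthresh - (int)in_flight;
	int min_send = fast_rexmit ? 1 : 0;

	if (in_flight > s->snd_ssthresh) {
		/* Proportional part: spread the reduction over one RTT;
		 * the "+ prior_cwnd - 1" term makes the division round up.
		 */
		uint64_t dividend = (uint64_t)s->snd_ssthresh * s->prr_delivered +
				    s->prior_cwnd - 1;
		sndcnt = (int)(dividend / s->prior_cwnd) - (int)s->prr_out;
	} else {
		/* Slow-start reduction bound: grow back toward ssthresh. */
		int room = (int)s->prr_delivered - (int)s->prr_out;

		if (room < newly_acked_sacked)
			room = newly_acked_sacked;
		sndcnt = delta < room + 1 ? delta : room + 1;
	}

	/* A fast retransmit is always allowed to go out. */
	if (sndcnt < min_send)
		sndcnt = min_send;
	return sndcnt;
}

int main(void)
{
	/* cwnd was 20, ssthresh is 10, 4 packets delivered and 1 sent since
	 * the reduction started, 15 still in flight.
	 */
	struct prr_state s = { .prior_cwnd = 20, .snd_ssthresh = 10,
			       .prr_delivered = 4, .prr_out = 1 };
	int sndcnt = prr_sndcnt(&s, 15, 2, 0);

	printf("sndcnt = %d, next cwnd = %d\n", sndcnt, 15 + sndcnt);
	return 0;
}

Running the example: with prior_cwnd = 20, snd_ssthresh = 10, prr_delivered = 4 and prr_out = 1 while 15 packets are in flight, the proportional branch gives sndcnt = ceil(10 * 4 / 20) - 1 = 1, so cwnd becomes 15 + 1 = 16. Repeated over the ACKs of one RTT this walks cwnd down to ssthresh instead of halving it in a single step, which is the behaviour the replaced tcp_cwnd_down() rate-halving code approximated less precisely.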
