Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c	425
1 file changed, 231 insertions(+), 194 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 25a89eaa669..40639c288dc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -267,11 +267,31 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
  * 1. Tuning sk->sk_sndbuf, when connection enters established state.
  */
-static void tcp_fixup_sndbuf(struct sock *sk)
+static void tcp_sndbuf_expand(struct sock *sk)
 {
-	int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	int sndmem, per_mss;
+	u32 nr_segs;
+
+	/* Worst case is non GSO/TSO : each frame consumes one skb
+	 * and skb->head is kmalloced using power of two area of memory
+	 */
+	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
+		  MAX_TCP_HEADER +
+		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	per_mss = roundup_pow_of_two(per_mss) +
+		  SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
+
+	/* Fast Recovery (RFC 5681 3.2) :
+	 * Cubic needs 1.7 factor, rounded to 2 to include
+	 * extra cushion (application might react slowly to POLLOUT)
+	 */
+	sndmem = 2 * nr_segs * per_mss;
 
-	sndmem *= TCP_INIT_CWND;
 	if (sk->sk_sndbuf < sndmem)
 		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
 }
@@ -355,6 +375,12 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 	rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
 		 tcp_default_init_rwnd(mss);
 
+	/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
+	 * Allow enough cushion so that sender is not limited by our window
+	 */
+	if (sysctl_tcp_moderate_rcvbuf)
+		rcvmem <<= 2;
+
 	if (sk->sk_rcvbuf < rcvmem)
 		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
 }
@@ -370,9 +396,11 @@ void tcp_init_buffer_space(struct sock *sk)
 	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
 		tcp_fixup_rcvbuf(sk);
 	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
-		tcp_fixup_sndbuf(sk);
+		tcp_sndbuf_expand(sk);
 
 	tp->rcvq_space.space = tp->rcv_wnd;
+	tp->rcvq_space.time = tcp_time_stamp;
+	tp->rcvq_space.seq = tp->copied_seq;
 
 	maxwin = tcp_full_space(sk);
 
@@ -512,48 +540,62 @@ void tcp_rcv_space_adjust(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int time;
-	int space;
-
-	if (tp->rcvq_space.time == 0)
-		goto new_measure;
+	int copied;
 
 	time = tcp_time_stamp - tp->rcvq_space.time;
 	if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
 		return;
 
-	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
+	/* Number of bytes copied to user in last RTT */
+	copied = tp->copied_seq - tp->rcvq_space.seq;
+	if (copied <= tp->rcvq_space.space)
+		goto new_measure;
 
-	space = max(tp->rcvq_space.space, space);
+	/* A bit of theory :
+	 * copied = bytes received in previous RTT, our base window
+	 * To cope with packet losses, we need a 2x factor
+	 * To cope with slow start, and sender growing its cwin by 100 %
+	 * every RTT, we need a 4x factor, because the ACK we are sending
+	 * now is for the next RTT, not the current one :
+	 * <prev RTT . ><current RTT .. ><next RTT .... >
+	 */
 
-	if (tp->rcvq_space.space != space) {
-		int rcvmem;
+	if (sysctl_tcp_moderate_rcvbuf &&
+	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+		int rcvwin, rcvmem, rcvbuf;
 
-		tp->rcvq_space.space = space;
+		/* minimal window to cope with packet losses, assuming
+		 * steady state. Add some cushion because of small variations.
+		 */
+		rcvwin = (copied << 1) + 16 * tp->advmss;
 
-		if (sysctl_tcp_moderate_rcvbuf &&
-		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-			int new_clamp = space;
+		/* If rate increased by 25%,
+		 *	assume slow start, rcvwin = 3 * copied
+		 * If rate increased by 50%,
+		 *	assume sender can use 2x growth, rcvwin = 4 * copied
+		 */
+		if (copied >=
+		    tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
+			if (copied >=
+			    tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
+				rcvwin <<= 1;
+			else
+				rcvwin += (rcvwin >> 1);
+		}
 
-			/* Receive space grows, normalize in order to
-			 * take into account packet headers and sk_buff
-			 * structure overhead.
-			 */
-			space /= tp->advmss;
-			if (!space)
-				space = 1;
-			rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
-			while (tcp_win_from_space(rcvmem) < tp->advmss)
-				rcvmem += 128;
-			space *= rcvmem;
-			space = min(space, sysctl_tcp_rmem[2]);
-			if (space > sk->sk_rcvbuf) {
-				sk->sk_rcvbuf = space;
-
-				/* Make the window clamp follow along.  */
-				tp->window_clamp = new_clamp;
-			}
+		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
+		while (tcp_win_from_space(rcvmem) < tp->advmss)
+			rcvmem += 128;
+
+		rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+		if (rcvbuf > sk->sk_rcvbuf) {
+			sk->sk_rcvbuf = rcvbuf;
+
+			/* Make the window clamp follow along.  */
+			tp->window_clamp = rcvwin;
 		}
 	}
+	tp->rcvq_space.space = copied;
 
 new_measure:
 	tp->rcvq_space.seq = tp->copied_seq;
@@ -625,10 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
  * To save cycles in the RFC 1323 implementation it was better to break
  * it up into three procedures. -- erics
  */
-static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
+static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	long m = mrtt; /* RTT */
+	long m = mrtt_us; /* RTT */
+	u32 srtt = tp->srtt_us;
 
 	/*	The following amusing code comes from Jacobson's
 	 *	article in SIGCOMM '88.  Note that rtt and mdev
@@ -646,14 +689,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 	 * does not matter how to _calculate_ it. Seems, it was trap
 	 * that VJ failed to avoid. 8)
 	 */
-	if (m == 0)
-		m = 1;
-	if (tp->srtt != 0) {
-		m -= (tp->srtt >> 3);	/* m is now error in rtt est */
-		tp->srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
+	if (srtt != 0) {
+		m -= (srtt >> 3);	/* m is now error in rtt est */
+		srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
 		if (m < 0) {
 			m = -m;		/* m is now abs(error) */
-			m -= (tp->mdev >> 2);   /* similar update on mdev */
+			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
 			/* This is similar to one of Eifel findings.
 			 * Eifel blocks mdev updates when rtt decreases.
			 * This solution is a bit different: we use finer gain
@@ -665,27 +706,29 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 			if (m > 0)
 				m >>= 3;
 		} else {
-			m -= (tp->mdev >> 2);   /* similar update on mdev */
+			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
 		}
-		tp->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */
-		if (tp->mdev > tp->mdev_max) {
-			tp->mdev_max = tp->mdev;
-			if (tp->mdev_max > tp->rttvar)
-				tp->rttvar = tp->mdev_max;
+		tp->mdev_us += m;		/* mdev = 3/4 mdev + 1/4 new */
+		if (tp->mdev_us > tp->mdev_max_us) {
+			tp->mdev_max_us = tp->mdev_us;
+			if (tp->mdev_max_us > tp->rttvar_us)
+				tp->rttvar_us = tp->mdev_max_us;
 		}
 		if (after(tp->snd_una, tp->rtt_seq)) {
-			if (tp->mdev_max < tp->rttvar)
-				tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
+			if (tp->mdev_max_us < tp->rttvar_us)
+				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
 			tp->rtt_seq = tp->snd_nxt;
-			tp->mdev_max = tcp_rto_min(sk);
+			tp->mdev_max_us = tcp_rto_min_us(sk);
 		}
 	} else {
 		/* no previous measure. */
-		tp->srtt = m << 3;	/* take the measured time to be rtt */
-		tp->mdev = m << 1;	/* make sure rto = 3*rtt */
-		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
+		srtt = m << 3;		/* take the measured time to be rtt */
+		tp->mdev_us = m << 1;	/* make sure rto = 3*rtt */
+		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
+		tp->mdev_max_us = tp->rttvar_us;
 		tp->rtt_seq = tp->snd_nxt;
 	}
+	tp->srtt_us = max(1U, srtt);
 }
 
 /* Set the sk_pacing_rate to allow proper sizing of TSO packets.
@@ -700,26 +743,25 @@ static void tcp_update_pacing_rate(struct sock *sk)
 	u64 rate;
 
 	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
-	rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+	rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
 
 	rate *= max(tp->snd_cwnd, tp->packets_out);
 
-	/* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
-	 * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
-	 * We probably need usec resolution in the future.
-	 * Note: This also takes care of possible srtt=0 case,
-	 * when tcp_rtt_estimator() was not yet called.
-	 */
-	if (tp->srtt > 8 + 2)
-		do_div(rate, tp->srtt);
+	if (likely(tp->srtt_us))
+		do_div(rate, tp->srtt_us);
 
-	sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+	/* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
+	 * without any lock. We want to make sure compiler wont store
+	 * intermediate values in this location.
+	 */
+	ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
+						sk->sk_max_pacing_rate);
 }
 
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
  * routine referred to above.
  */
-void tcp_set_rto(struct sock *sk)
+static void tcp_set_rto(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	/* Old crap is replaced with new one. 8)
@@ -1064,7 +1106,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
 	}
 
 	/* D-SACK for already forgotten data... Do dumb counting. */
-	if (dup_sack && tp->undo_marker && tp->undo_retrans &&
+	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
 	    !after(end_seq_0, prior_snd_una) &&
 	    after(end_seq_0, tp->undo_marker))
 		tp->undo_retrans--;
@@ -1073,10 +1115,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
 }
 
 struct tcp_sacktag_state {
-	int reord;
-	int fack_count;
-	int flag;
-	s32 rtt; /* RTT measured by SACKing never-retransmitted data */
+	int	reord;
+	int	fack_count;
+	long	rtt_us; /* RTT measured by SACKing never-retransmitted data */
+	int	flag;
 };
 
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1120,12 +1162,12 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 			unsigned int new_len = (pkt_len / mss) * mss;
 			if (!in_sack && new_len < pkt_len) {
 				new_len += mss;
-				if (new_len > skb->len)
+				if (new_len >= skb->len)
 					return 0;
 			}
 			pkt_len = new_len;
 		}
-		err = tcp_fragment(sk, skb, pkt_len, mss);
+		err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
 		if (err < 0)
 			return err;
 	}
@@ -1137,14 +1179,15 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 static u8 tcp_sacktag_one(struct sock *sk,
 			  struct tcp_sacktag_state *state, u8 sacked,
 			  u32 start_seq, u32 end_seq,
-			  int dup_sack, int pcount, u32 xmit_time)
+			  int dup_sack, int pcount,
+			  const struct skb_mstamp *xmit_time)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int fack_count = state->fack_count;
 
 	/* Account D-SACK for retransmitted packet. */
 	if (dup_sack && (sacked & TCPCB_RETRANS)) {
-		if (tp->undo_marker && tp->undo_retrans &&
+		if (tp->undo_marker && tp->undo_retrans > 0 &&
 		    after(end_seq, tp->undo_marker))
 			tp->undo_retrans--;
 		if (sacked & TCPCB_SACKED_ACKED)
@@ -1178,8 +1221,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
 				if (!after(end_seq, tp->high_seq))
 					state->flag |= FLAG_ORIG_SACK_ACKED;
 				/* Pick the earliest sequence sacked for RTT */
-				if (state->rtt < 0)
-					state->rtt = tcp_time_stamp - xmit_time;
+				if (state->rtt_us < 0) {
+					struct skb_mstamp now;
+
+					skb_mstamp_get(&now);
+					state->rtt_us = skb_mstamp_us_delta(&now,
+								xmit_time);
+				}
 			}
 
 			if (sacked & TCPCB_LOST) {
@@ -1238,7 +1286,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	 */
 	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
 			start_seq, end_seq, dup_sack, pcount,
-			TCP_SKB_CB(skb)->when);
+			&skb->skb_mstamp);
 
 	if (skb == tp->lost_skb_hint)
 		tp->lost_cnt_hint += pcount;
@@ -1284,7 +1332,10 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 		tp->lost_cnt_hint -= tcp_skb_pcount(prev);
 	}
 
-	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags;
+	TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+		TCP_SKB_CB(prev)->end_seq++;
+
 	if (skb == tcp_highest_sack(sk))
 		tcp_advance_highest_sack(sk, skb);
 
@@ -1513,7 +1564,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 						TCP_SKB_CB(skb)->end_seq,
 						dup_sack,
 						tcp_skb_pcount(skb),
-						TCP_SKB_CB(skb)->when);
+						&skb->skb_mstamp);
 
 			if (!before(TCP_SKB_CB(skb)->seq,
 				    tcp_highest_sack_seq(tp)))
@@ -1570,7 +1621,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
 static int
 tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
-			u32 prior_snd_una, s32 *sack_rtt)
+			u32 prior_snd_una, long *sack_rtt_us)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1588,7 +1639,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 	state.flag = 0;
 	state.reord = tp->packets_out;
-	state.rtt = -1;
+	state.rtt_us = -1L;
 
 	if (!tp->sacked_out) {
 		if (WARN_ON(tp->fackets_out))
@@ -1772,7 +1823,7 @@ out:
 	WARN_ON((int)tp->retrans_out < 0);
 	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
 #endif
-	*sack_rtt = state.rtt;
+	*sack_rtt_us = state.rtt_us;
 	return state.flag;
 }
 
@@ -1842,7 +1893,7 @@ static void tcp_clear_retrans_partial(struct tcp_sock *tp)
 	tp->lost_out = 0;
 
 	tp->undo_marker = 0;
-	tp->undo_retrans = 0;
+	tp->undo_retrans = -1;
 }
 
 void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1893,8 +1944,9 @@ void tcp_enter_loss(struct sock *sk, int how)
 		if (skb == tcp_send_head(sk))
 			break;
 
-		if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
 			tp->undo_marker = 0;
+
 		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
 		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
@@ -1982,10 +2034,12 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
 	 * available, or RTO is scheduled to fire first.
 	 */
 	if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
-	    (flag & FLAG_ECE) || !tp->srtt)
+	    (flag & FLAG_ECE) || !tp->srtt_us)
 		return false;
 
-	delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
+	delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
+		    msecs_to_jiffies(2));
+
 	if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
 		return false;
 
@@ -2187,7 +2241,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 				break;
 
 			mss = skb_shinfo(skb)->gso_size;
-			err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
+			err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
+					   mss, GFP_ATOMIC);
 			if (err < 0)
 				break;
 			cnt = packets;
@@ -2610,7 +2665,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 
 	tp->prior_ssthresh = 0;
 	tp->undo_marker = tp->snd_una;
-	tp->undo_retrans = tp->retrans_out;
+	tp->undo_retrans = tp->retrans_out ? : -1;
 
 	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 		if (!ece_ack)
@@ -2630,13 +2685,12 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 	bool recovered = !before(tp->snd_una, tp->high_seq);
 
 	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
-		if (flag & FLAG_ORIG_SACK_ACKED) {
-			/* Step 3.b. A timeout is spurious if not all data are
-			 * lost, i.e., never-retransmitted data are (s)acked.
-			 */
-			tcp_try_undo_loss(sk, true);
+		/* Step 3.b. A timeout is spurious if not all data are
+		 * lost, i.e., never-retransmitted data are (s)acked.
+		 */
+		if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED))
 			return;
-		}
+
 		if (after(tp->snd_nxt, tp->high_seq) &&
 		    (flag & FLAG_DATA_SACKED || is_dupack)) {
 			tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
@@ -2832,7 +2886,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 }
 
 static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
-				      s32 seq_rtt, s32 sack_rtt)
+				      long seq_rtt_us, long sack_rtt_us)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2842,10 +2896,10 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 * is acked (RFC6298).
 	 */
 	if (flag & FLAG_RETRANS_DATA_ACKED)
-		seq_rtt = -1;
+		seq_rtt_us = -1L;
 
-	if (seq_rtt < 0)
-		seq_rtt = sack_rtt;
+	if (seq_rtt_us < 0)
+		seq_rtt_us = sack_rtt_us;
 
 	/* RTTM Rule: A TSecr value received in a segment is used to
 	 * update the averaged RTT measurement only if the segment
@@ -2853,13 +2907,14 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 * left edge of the send window.
 	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
 	 */
-	if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-		seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+	    flag & FLAG_ACKED)
+		seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
 
-	if (seq_rtt < 0)
+	if (seq_rtt_us < 0)
 		return false;
 
-	tcp_rtt_estimator(sk, seq_rtt);
+	tcp_rtt_estimator(sk, seq_rtt_us);
 	tcp_set_rto(sk);
 
 	/* RFC6298: only reset backoff on valid RTT measurement. */
@@ -2868,20 +2923,26 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 }
 
 /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
-static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
+static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	s32 seq_rtt = -1;
+	long seq_rtt_us = -1L;
+
+	if (synack_stamp && !tp->total_retrans)
+		seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
 
-	if (tp->lsndtime && !tp->total_retrans)
-		seq_rtt = tcp_time_stamp - tp->lsndtime;
-	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
+	/* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
+	 * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
+	 */
+	if (!tp->srtt_us)
+		tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
 }
 
-static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
+
+	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
 	tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
 }
 
@@ -2964,25 +3025,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
  * arrived at the other end.
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
-			       u32 prior_snd_una, s32 sack_rtt)
+			       u32 prior_snd_una, long sack_rtt_us)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct skb_mstamp first_ackt, last_ackt, now;
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 prior_sacked = tp->sacked_out;
+	u32 reord = tp->packets_out;
+	bool fully_acked = true;
+	long ca_seq_rtt_us = -1L;
+	long seq_rtt_us = -1L;
 	struct sk_buff *skb;
-	u32 now = tcp_time_stamp;
-	int fully_acked = true;
-	int flag = 0;
 	u32 pkts_acked = 0;
-	u32 reord = tp->packets_out;
-	u32 prior_sacked = tp->sacked_out;
-	s32 seq_rtt = -1;
-	s32 ca_seq_rtt = -1;
-	ktime_t last_ackt = net_invalid_timestamp();
+	bool rtt_update;
+	int flag = 0;
+
+	first_ackt.v64 = 0;
 
 	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
-		u32 acked_pcount;
 		u8 sacked = scb->sacked;
+		u32 acked_pcount;
 
 		/* Determine how many packets and what bytes were acked, tso and else */
 		if (after(scb->end_seq, tp->snd_una)) {
@@ -3004,11 +3067,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 				tp->retrans_out -= acked_pcount;
 			flag |= FLAG_RETRANS_DATA_ACKED;
 		} else {
-			ca_seq_rtt = now - scb->when;
-			last_ackt = skb->tstamp;
-			if (seq_rtt < 0) {
-				seq_rtt = ca_seq_rtt;
-			}
+			last_ackt = skb->skb_mstamp;
+			WARN_ON_ONCE(last_ackt.v64 == 0);
+			if (!first_ackt.v64)
+				first_ackt = last_ackt;
+
 			if (!(sacked & TCPCB_SACKED_ACKED))
 				reord = min(pkts_acked, reord);
 			if (!after(scb->end_seq, tp->high_seq))
@@ -3054,14 +3117,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 		flag |= FLAG_SACK_RENEGING;
 
-	if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
-	    (flag & FLAG_ACKED))
-		tcp_rearm_rto(sk);
+	skb_mstamp_get(&now);
+	if (first_ackt.v64) {
+		seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
+		ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+	}
+
+	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
 
 	if (flag & FLAG_ACKED) {
 		const struct tcp_congestion_ops *ca_ops
 			= inet_csk(sk)->icsk_ca_ops;
 
+		tcp_rearm_rto(sk);
 		if (unlikely(icsk->icsk_mtup.probe_size &&
 			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
 			tcp_mtup_probe_success(sk);
@@ -3083,23 +3151,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 		tp->fackets_out -= min(pkts_acked, tp->fackets_out);
 
-		if (ca_ops->pkts_acked) {
-			s32 rtt_us = -1;
-
-			/* Is the ACK triggering packet unambiguous? */
-			if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
-				/* High resolution needed and available? */
-				if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
-				    !ktime_equal(last_ackt,
-						 net_invalid_timestamp()))
-					rtt_us = ktime_us_delta(ktime_get_real(),
-								last_ackt);
-				else if (ca_seq_rtt >= 0)
-					rtt_us = jiffies_to_usecs(ca_seq_rtt);
-			}
+		if (ca_ops->pkts_acked)
+			ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us);
 
-			ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
-		}
+	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
+		   sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+		/* Do not re-arm RTO if the sack RTT is measured from data sent
+		 * after when the head was last (re)transmitted. Otherwise the
+		 * timeout may continue to extend in loss recovery.
+		 */
+		tcp_rearm_rto(sk);
 	}
 
 #if FASTRETRANS_DEBUG > 0
@@ -3288,7 +3349,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 			tcp_init_cwnd_reduction(sk, true);
 			tcp_set_ca_state(sk, TCP_CA_CWR);
 			tcp_end_cwnd_reduction(sk);
-			tcp_set_ca_state(sk, TCP_CA_Open);
+			tcp_try_keep_open(sk);
 			NET_INC_STATS_BH(sock_net(sk),
 					 LINUX_MIB_TCPLOSSPROBERECOVERY);
 		}
@@ -3304,12 +3365,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	bool is_dupack = false;
-	u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
 	u32 prior_fackets;
 	int prior_packets = tp->packets_out;
 	const int prior_unsacked = tp->packets_out - tp->sacked_out;
 	int acked = 0; /* Number of packets newly acked */
-	s32 sack_rtt = -1;
+	long sack_rtt_us = -1L;
 
 	/* If the ack is older than previous acks
 	 * then we can probably ignore it.
@@ -3337,7 +3397,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		flag |= FLAG_SND_UNA_ADVANCED;
 
 	prior_fackets = tp->fackets_out;
-	prior_in_flight = tcp_packets_in_flight(tp);
 
 	/* ts_recent update must be made after we are sure that the packet
 	 * is in window.
@@ -3367,7 +3426,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		if (TCP_SKB_CB(skb)->sacked)
 			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
-							&sack_rtt);
+							&sack_rtt_us);
 
 		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
 			flag |= FLAG_ECE;
@@ -3386,12 +3445,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	/* See if we can take anything off of the retransmit queue. */
 	acked = tp->packets_out;
-	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
+	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
+				    sack_rtt_us);
 	acked -= tp->packets_out;
 
 	/* Advance cwnd if state allows */
 	if (tcp_may_raise_cwnd(sk, flag))
-		tcp_cong_avoid(sk, ack, prior_in_flight);
+		tcp_cong_avoid(sk, ack, acked);
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3409,8 +3469,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
 
-	if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
-		tcp_update_pacing_rate(sk);
+	tcp_update_pacing_rate(sk);
 	return 1;
 
 no_queue:
@@ -3439,7 +3498,7 @@ old_ack:
 	 */
 	if (TCP_SKB_CB(skb)->sacked) {
 		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
-						&sack_rtt);
+						&sack_rtt_us);
 		tcp_fastretrans_alert(sk, acked, prior_unsacked,
 				      is_dupack, flag);
 	}
@@ -3623,7 +3682,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
 		int opcode = *ptr++;
 		int opsize;
 
-		switch(opcode) {
+		switch (opcode) {
 		case TCPOPT_EOL:
 			return NULL;
 		case TCPOPT_NOP:
@@ -3983,7 +4042,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
 			WARN_ON(before(tp->rcv_nxt, sp->end_seq));
 
 			/* Zap this SACK, by moving forward any other SACKS. */
-			for (i=this_sack+1; i < num_sacks; i++)
+			for (i = this_sack+1; i < num_sacks; i++)
 				tp->selective_acks[i-1] = tp->selective_acks[i];
 			num_sacks--;
 			continue;
@@ -4353,7 +4412,7 @@ queue_and_out:
 		if (eaten > 0)
 			kfree_skb_partial(skb, fragstolen);
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_data_ready(sk, 0);
+			sk->sk_data_ready(sk);
 		return;
 	}
 
@@ -4643,28 +4702,6 @@ static int tcp_prune_queue(struct sock *sk)
 	return -1;
 }
 
-/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
- * As additional protections, we do not touch cwnd in retransmission phases,
- * and if application hit its sndbuf limit recently.
- */
-void tcp_cwnd_application_limited(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
-	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
-		/* Limited by application or receiver window. */
-		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
-		u32 win_used = max(tp->snd_cwnd_used, init_win);
-		if (win_used < tp->snd_cwnd) {
-			tp->snd_ssthresh = tcp_current_ssthresh(sk);
-			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
-		}
-		tp->snd_cwnd_used = 0;
-	}
-	tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
 static bool tcp_should_expand_sndbuf(const struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
@@ -4701,15 +4738,7 @@ static void tcp_new_space(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tcp_should_expand_sndbuf(sk)) {
-		int sndmem = SKB_TRUESIZE(max_t(u32,
-						tp->rx_opt.mss_clamp,
-						tp->mss_cache) +
-					  MAX_TCP_HEADER);
-		int demanded = max_t(unsigned int, tp->snd_cwnd,
-				     tp->reordering + 1);
-		sndmem *= 2 * demanded;
-		if (sndmem > sk->sk_sndbuf)
-			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+		tcp_sndbuf_expand(sk);
 		tp->snd_cwnd_stamp = tcp_time_stamp;
 	}
 
@@ -4862,7 +4891,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
 				BUG();
 			tp->urg_data = TCP_URG_VALID | tmp;
 			if (!sock_flag(sk, SOCK_DEAD))
-				sk->sk_data_ready(sk, 0);
+				sk->sk_data_ready(sk);
 		}
 	}
 }
@@ -4948,11 +4977,11 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
 		    (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
 		    (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
 			tp->ucopy.wakeup = 1;
-			sk->sk_data_ready(sk, 0);
+			sk->sk_data_ready(sk);
 		}
 	} else if (chunk > 0) {
 		tp->ucopy.wakeup = 1;
-		sk->sk_data_ready(sk, 0);
+		sk->sk_data_ready(sk);
 	}
 out:
 	return copied_early;
@@ -5223,7 +5252,7 @@ no_ack:
 #endif
 			if (eaten)
 				kfree_skb_partial(skb, fragstolen);
-			sk->sk_data_ready(sk, 0);
+			sk->sk_data_ready(sk);
 			return;
 		}
 	}
@@ -5343,9 +5372,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 				break;
 		}
 		tcp_rearm_rto(sk);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
 		return true;
 	}
 	tp->syn_data_acked = tp->syn_data;
+	if (tp->syn_data_acked)
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
 	return false;
 }
 
@@ -5584,6 +5616,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	struct request_sock *req;
 	int queued = 0;
 	bool acceptable;
+	u32 synack_stamp;
 
 	tp->rx_opt.saw_tstamp = 0;
 
@@ -5666,16 +5699,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		 * so release it.
 		 */
 		if (req) {
+			synack_stamp = tcp_rsk(req)->snt_synack;
 			tp->total_retrans = req->num_retrans;
 			reqsk_fastopen_remove(sk, req, false);
 		} else {
+			synack_stamp = tp->lsndtime;
 			/* Make sure socket is routed, for correct metrics. */
 			icsk->icsk_af_ops->rebuild_header(sk);
 			tcp_init_congestion_control(sk);
 
 			tcp_mtup_init(sk);
-			tcp_init_buffer_space(sk);
 			tp->copied_seq = tp->rcv_nxt;
+			tcp_init_buffer_space(sk);
 		}
 		smp_mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);
@@ -5691,7 +5726,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
 		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
-		tcp_synack_rtt_meas(sk, req);
+		tcp_synack_rtt_meas(sk, synack_stamp);
 
 		if (tp->rx_opt.tstamp_ok)
 			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -5709,6 +5744,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		} else
 			tcp_init_metrics(sk);
 
+		tcp_update_pacing_rate(sk);
+
 		/* Prevent spurious tcp_cwnd_restart() on first data packet */
 		tp->lsndtime = tcp_time_stamp;
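
Editorial notes (not part of the patch): the sketches below are small, self-contained userspace reconstructions of arithmetic the diff above introduces. They are illustrative only; every constant or helper they define (anything prefixed ASSUMED_, plus align_up(), max_u32() and similar) is an assumption standing in for the real kernel definition.

First, the sizing rule in the new tcp_sndbuf_expand(): estimate the worst-case (non-GSO/TSO) truesize of one MSS-sized frame, scale by the number of segments a window may need to hold, then double the result as the fast-recovery cushion described in the patch comment.

#include <stdint.h>
#include <stdio.h>

#define ASSUMED_MAX_TCP_HEADER	224u	/* stand-in for MAX_TCP_HEADER */
#define ASSUMED_SHINFO_SIZE	320u	/* stand-in for sizeof(struct skb_shared_info) */
#define ASSUMED_SKB_SIZE	232u	/* stand-in for sizeof(struct sk_buff) */
#define ASSUMED_CACHE_BYTES	64u	/* stand-in for SMP_CACHE_BYTES */
#define TCP_INIT_CWND		10u

static uint32_t align_up(uint32_t x, uint32_t a)  { return (x + a - 1) & ~(a - 1); }
static uint32_t round_up_pow2(uint32_t x)         { uint32_t p = 1; while (p < x) p <<= 1; return p; }
static uint32_t max_u32(uint32_t a, uint32_t b)   { return a > b ? a : b; }

/* Worst case, non-GSO/TSO: one skb per MSS, skb->head served from a
 * power-of-two kmalloc slab; double everything as the fast-recovery
 * cushion (RFC 5681 3.2, Cubic's 1.7 factor rounded up to 2).
 */
static uint32_t sndbuf_target(uint32_t mss, uint32_t snd_cwnd, uint32_t reordering)
{
	uint32_t per_mss, nr_segs;

	per_mss = mss + ASSUMED_MAX_TCP_HEADER +
		  align_up(ASSUMED_SHINFO_SIZE, ASSUMED_CACHE_BYTES);
	per_mss = round_up_pow2(per_mss) +
		  align_up(ASSUMED_SKB_SIZE, ASSUMED_CACHE_BYTES);

	nr_segs = max_u32(TCP_INIT_CWND, snd_cwnd);
	nr_segs = max_u32(nr_segs, reordering + 1);

	return 2 * nr_segs * per_mss;
}

int main(void)
{
	printf("sndbuf target, mss=1460 cwnd=10 reordering=3: %u bytes\n",
	       sndbuf_target(1460, 10, 3));
	return 0;
}

The point of the change is visible in the sketch: the target is driven by allocator truesize (a power-of-two slab plus skb metadata), not just payload bytes, which is why the old SKB_TRUESIZE(mss) * TCP_INIT_CWND formula undershot.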
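
Next, the receive-window heuristic now used by tcp_rcv_space_adjust() when tcp_moderate_rcvbuf is enabled. copied is what the application consumed in the last RTT and prev_space is the previous measurement; the 25% / 50% growth tests implement the "3 * copied / 4 * copied" cases from the patch comment. Only the window calculation is sketched; converting rcvwin into sk_rcvbuf via the per-MSS truesize and clamping to tcp_rmem[2] is left out.

#include <stdint.h>

/* Returns the advertised-window target in bytes; a sketch of the DRS logic
 * above, with the rcvbuf conversion and sysctl clamping omitted.
 */
static uint32_t drs_rcvwin(uint32_t copied, uint32_t prev_space, uint32_t advmss)
{
	/* base window: 2x the bytes consumed last RTT, plus a 16*MSS cushion */
	uint32_t rcvwin = (copied << 1) + 16 * advmss;

	if (copied >= prev_space + (prev_space >> 2)) {		/* grew >= 25% */
		if (copied >= prev_space + (prev_space >> 1))	/* grew >= 50% */
			rcvwin <<= 1;		/* assume 2x sender growth: ~4 * copied */
		else
			rcvwin += rcvwin >> 1;	/* assume slow start: ~3 * copied */
	}
	return rcvwin;
}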
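
The srtt/mdev smoothing itself is unchanged by the diff apart from moving to microseconds and to a local srtt variable; a stripped-down version is shown below. It keeps the kernel's fixed-point convention (srtt stored <<3, mdev stored <<2), so the updates are the usual srtt += (m - srtt/8)/8 and mdev += (|err| - mdev/4)/4 written with shifts, but it drops the finer-gain handling for decreasing RTTs, the mdev_max/rttvar windowing and the RTO-min clamp. A teaching sketch, not a drop-in tcp_rtt_estimator().

#include <stdint.h>

struct rtt_est {
	uint32_t srtt_us;	/* smoothed RTT << 3, in usec */
	uint32_t mdev_us;	/* mean deviation << 2, in usec */
};

static void rtt_sample(struct rtt_est *e, long mrtt_us)
{
	long m = mrtt_us;

	if (e->srtt_us) {
		m -= (e->srtt_us >> 3);	/* m is now the error in the estimate */
		e->srtt_us += m;	/* srtt = 7/8 srtt + 1/8 new */
		if (m < 0)
			m = -m;		/* m is now abs(error) */
		m -= (e->mdev_us >> 2);
		e->mdev_us += m;	/* mdev = 3/4 mdev + 1/4 |err| */
	} else {
		e->srtt_us = m << 3;	/* first sample: take it as the RTT */
		e->mdev_us = m << 1;	/* make the initial RTO generous */
	}
	if (!e->srtt_us)
		e->srtt_us = 1;		/* keep srtt_us non-zero, as the patch does */
}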
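
The pacing-rate update reduces to 2 * mss * cwnd / srtt once the shifts cancel: srtt_us is stored left-shifted by 3, and the patch multiplies by USEC_PER_SEC << 3 so the <<3 factors drop out. Below is a plain-arithmetic version, without the sk_max_pacing_rate clamp and the ACCESS_ONCE() store the kernel needs for lockless readers such as sch_fq.

#include <stdint.h>

#define USEC_PER_SEC 1000000ULL

static uint64_t pacing_rate_bps(uint32_t mss, uint32_t cwnd,
				uint32_t packets_out, uint32_t srtt_us_shifted3)
{
	/* 200% of the current rate: 2 * mss * cwnd / srtt (bytes per second) */
	uint64_t rate = (uint64_t)mss * 2 * (USEC_PER_SEC << 3);

	rate *= (cwnd > packets_out) ? cwnd : packets_out;
	if (srtt_us_shifted3)		/* srtt may still be 0 before any sample */
		rate /= srtt_us_shifted3;
	return rate;
}

For example, mss 1460, cwnd 10 and a 100 ms smoothed RTT (srtt_us_shifted3 = 800000) give 292000 bytes/s, i.e. twice the currently observed delivery rate.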
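
Finally, a userspace analogue of the RTT sampling that replaces the old jiffies/ktime mix in tcp_clean_rtx_queue(): each transmitted segment carries one timestamp, and on a cumulative ACK two deltas are taken against a single "now" reading - from the oldest newly acked segment (fed to the RTT estimator) and from the newest (fed to the congestion module's pkts_acked hook). clock_gettime() stands in for skb_mstamp_get() and the subtraction for skb_mstamp_us_delta(); the struct and function names here are illustrative only.

#include <stdint.h>
#include <time.h>

static int64_t now_us(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
}

struct acked_range {
	int64_t first_xmit_us;	/* timestamp of oldest newly acked segment, 0 if none */
	int64_t last_xmit_us;	/* timestamp of newest newly acked segment */
};

static void rtt_samples(const struct acked_range *r,
			long *seq_rtt_us, long *ca_seq_rtt_us)
{
	int64_t now = now_us();

	*seq_rtt_us = *ca_seq_rtt_us = -1L;	/* -1 means "no valid sample" */
	if (r->first_xmit_us) {
		*seq_rtt_us = (long)(now - r->first_xmit_us);	/* -> RTT estimator */
		*ca_seq_rtt_us = (long)(now - r->last_xmit_us);	/* -> pkts_acked hook */
	}
}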
