Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c | 1534
1 file changed, 998 insertions(+), 536 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 749b6498588..179b51e6bda 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -34,6 +34,8 @@   *   */ +#define pr_fmt(fmt) "TCP: " fmt +  #include <net/tcp.h>  #include <linux/compiler.h> @@ -48,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;   */  int sysctl_tcp_workaround_signed_windows __read_mostly = 0; +/* Default TSQ limit of two TSO segments */ +int sysctl_tcp_limit_output_bytes __read_mostly = 131072; +  /* This limits the percentage of the congestion window which we   * will allow a single TSO frame to consume.  Building TSO frames   * which are too large can cause TCP streams to be bursty. @@ -60,27 +65,30 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;  /* By default, RFC2861 behavior.  */  int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ -EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); +unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; +EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); +static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +			   int push_one, gfp_t gfp);  /* Account for new data that has been sent to the network. */ -static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)  { +	struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk);  	unsigned int prior_packets = tp->packets_out;  	tcp_advance_send_head(sk, skb);  	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; -	/* Don't override Nagle indefinately with F-RTO */ -	if (tp->frto_counter == 2) -		tp->frto_counter = 3; -  	tp->packets_out += tcp_skb_pcount(skb); -	if (!prior_packets) -		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, -					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX); +	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || +	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { +		tcp_rearm_rto(sk); +	} + +	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, +		      tcp_skb_pcount(skb));  }  /* SND.NXT, if window was not shrunk. @@ -89,9 +97,9 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)   * Anything in between SND.UNA...SND.UNA+SND.WND also can be already   * invalid. OK, let's make this for now:   */ -static inline __u32 tcp_acceptable_seq(struct sock *sk) +static inline __u32 tcp_acceptable_seq(const struct sock *sk)  { -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	if (!before(tcp_wnd_end(tp), tp->snd_nxt))  		return tp->snd_nxt; @@ -116,12 +124,16 @@ static inline __u32 tcp_acceptable_seq(struct sock *sk)  static __u16 tcp_advertise_mss(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk); -	struct dst_entry *dst = __sk_dst_get(sk); +	const struct dst_entry *dst = __sk_dst_get(sk);  	int mss = tp->advmss; -	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { -		mss = dst_metric(dst, RTAX_ADVMSS); -		tp->advmss = mss; +	if (dst) { +		unsigned int metric = dst_metric_advmss(dst); + +		if (metric < mss) { +			mss = metric; +			tp->advmss = mss; +		}  	}  	return (__u16)mss; @@ -129,7 +141,7 @@ static __u16 tcp_advertise_mss(struct sock *sk)  /* RFC2861. Reset CWND after idle period longer RTO to "restart window".   * This is the first part of cwnd validation mechanism. 
*/ -static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) +static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)  {  	struct tcp_sock *tp = tcp_sk(sk);  	s32 delta = tcp_time_stamp - tp->lsndtime; @@ -150,10 +162,11 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)  /* Congestion state accounting after a packet has been sent. */  static void tcp_event_data_sent(struct tcp_sock *tp, -				struct sk_buff *skb, struct sock *sk) +				struct sock *sk)  {  	struct inet_connection_sock *icsk = inet_csk(sk);  	const u32 now = tcp_time_stamp; +	const struct dst_entry *dst = __sk_dst_get(sk);  	if (sysctl_tcp_slow_start_after_idle &&  	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) @@ -164,8 +177,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp,  	/* If it is a reply for ato after last received  	 * packet, enter pingpong mode.  	 */ -	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) -		icsk->icsk_ack.pingpong = 1; +	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato && +	    (!dst || !dst_metric(dst, RTAX_QUICKACK))) +			icsk->icsk_ack.pingpong = 1;  }  /* Account for an ACK we sent. */ @@ -175,6 +189,21 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)  	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);  } + +u32 tcp_default_init_rwnd(u32 mss) +{ +	/* Initial receive window should be twice of TCP_INIT_CWND to +	 * enable proper sending of new unsent data during fast recovery +	 * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a +	 * limit when mss is larger than 1460. +	 */ +	u32 init_rwnd = TCP_INIT_CWND * 2; + +	if (mss > 1460) +		init_rwnd = max((1460 * init_rwnd) / mss, 2U); +	return init_rwnd; +} +  /* Determine a window scaling and initial window to offer.   * Based on the assumption that the given amount of space   * will be offered. Store the results in the tp structure. @@ -224,18 +253,10 @@ void tcp_select_initial_window(int __space, __u32 mss,  		}  	} -	/* Set initial window to value enough for senders, following RFC5681. */  	if (mss > (1 << *rcv_wscale)) { -		int init_cwnd = rfc3390_bytes_to_packets(mss); - -		/* when initializing use the value from init_rcv_wnd -		 * rather than the default from above -		 */ -		if (init_rcv_wnd && -		    (*rcv_wnd > init_rcv_wnd * mss)) -			*rcv_wnd = init_rcv_wnd * mss; -		else if (*rcv_wnd > init_cwnd * mss) -			*rcv_wnd = init_cwnd * mss; +		if (!init_rcv_wnd) /* Use default unless specified otherwise */ +			init_rcv_wnd = tcp_default_init_rwnd(mss); +		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);  	}  	/* Set the clamp no higher than max representable value */ @@ -251,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window);  static u16 tcp_select_window(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk); +	u32 old_win = tp->rcv_wnd;  	u32 cur_win = tcp_receive_window(tp);  	u32 new_win = __tcp_select_window(sk); @@ -263,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk)  		 *  		 * Relax Will Robinson.  		 */ +		if (new_win == 0) +			NET_INC_STATS(sock_net(sk), +				      LINUX_MIB_TCPWANTZEROWINDOWADV);  		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);  	}  	tp->rcv_wnd = new_win; @@ -280,18 +305,24 @@ static u16 tcp_select_window(struct sock *sk)  	new_win >>= tp->rx_opt.rcv_wscale;  	/* If we advertise zero window, disable fast path. 
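The tcp_default_init_rwnd() helper added above advertises an initial receive window of twice TCP_INIT_CWND segments, scaled down once the MSS exceeds 1460 bytes and never below 2 segments. A minimal userspace sketch of that arithmetic, assuming TCP_INIT_CWND is 10 as in the kernel headers of this era:

#include <stdio.h>

#define TCP_INIT_CWND	10	/* assumption: matches the kernel's define */

static unsigned int default_init_rwnd(unsigned int mss)
{
	unsigned int init_rwnd = TCP_INIT_CWND * 2;	/* 20 segments */

	if (mss > 1460)
		init_rwnd = (1460 * init_rwnd) / mss;
	return init_rwnd > 2 ? init_rwnd : 2;		/* max(..., 2U) */
}

int main(void)
{
	static const unsigned int samples[] = { 536, 1460, 4096, 9000 };

	for (int i = 0; i < 4; i++) {
		unsigned int mss = samples[i];

		printf("mss %5u -> init_rwnd %2u segments (%u bytes)\n",
		       mss, default_init_rwnd(mss),
		       default_init_rwnd(mss) * mss);
	}
	return 0;
}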
*/ -	if (new_win == 0) +	if (new_win == 0) {  		tp->pred_flags = 0; +		if (old_win) +			NET_INC_STATS(sock_net(sk), +				      LINUX_MIB_TCPTOZEROWINDOWADV); +	} else if (old_win == 0) { +		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); +	}  	return new_win;  }  /* Packet ECN state for a SYN-ACK */ -static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) +static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)  { -	TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR; +	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;  	if (!(tp->ecn_flags & TCP_ECN_OK)) -		TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE; +		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;  }  /* Packet ECN state for a SYN.  */ @@ -300,14 +331,14 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)  	struct tcp_sock *tp = tcp_sk(sk);  	tp->ecn_flags = 0; -	if (sysctl_tcp_ecn == 1) { -		TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR; +	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { +		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;  		tp->ecn_flags = TCP_ECN_OK;  	}  }  static __inline__ void -TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th) +TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)  {  	if (inet_rsk(req)->ecn_ok)  		th->ece = 1; @@ -345,15 +376,17 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,   */  static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)  { +	struct skb_shared_info *shinfo = skb_shinfo(skb); +  	skb->ip_summed = CHECKSUM_PARTIAL;  	skb->csum = 0; -	TCP_SKB_CB(skb)->flags = flags; +	TCP_SKB_CB(skb)->tcp_flags = flags;  	TCP_SKB_CB(skb)->sacked = 0; -	skb_shinfo(skb)->gso_segs = 1; -	skb_shinfo(skb)->gso_size = 0; -	skb_shinfo(skb)->gso_type = 0; +	shinfo->gso_segs = 1; +	shinfo->gso_size = 0; +	shinfo->gso_type = 0;  	TCP_SKB_CB(skb)->seq = seq;  	if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -361,7 +394,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)  	TCP_SKB_CB(skb)->end_seq = seq;  } -static inline int tcp_urg_mode(const struct tcp_sock *tp) +static inline bool tcp_urg_mode(const struct tcp_sock *tp)  {  	return tp->snd_una != tp->snd_up;  } @@ -370,51 +403,25 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp)  #define OPTION_TS		(1 << 1)  #define OPTION_MD5		(1 << 2)  #define OPTION_WSCALE		(1 << 3) -#define OPTION_COOKIE_EXTENSION	(1 << 4) +#define OPTION_FAST_OPEN_COOKIE	(1 << 8)  struct tcp_out_options { -	u8 options;		/* bit field of OPTION_* */ +	u16 options;		/* bit field of OPTION_* */ +	u16 mss;		/* 0 to disable */  	u8 ws;			/* window scale, 0 to disable */  	u8 num_sack_blocks;	/* number of SACK blocks to include */  	u8 hash_size;		/* bytes in hash_location */ -	u16 mss;		/* 0 to disable */ -	__u32 tsval, tsecr;	/* need to include OPTION_TS */  	__u8 *hash_location;	/* temporary pointer, overloaded */ +	__u32 tsval, tsecr;	/* need to include OPTION_TS */ +	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */  }; -/* The sysctl int routines are generic, so check consistency here. 
- */ -static u8 tcp_cookie_size_check(u8 desired) -{ -	if (desired > 0) { -		/* previously specified */ -		return desired; -	} -	if (sysctl_tcp_cookie_size <= 0) { -		/* no default specified */ -		return 0; -	} -	if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) { -		/* value too small, specify minimum */ -		return TCP_COOKIE_MIN; -	} -	if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) { -		/* value too large, specify maximum */ -		return TCP_COOKIE_MAX; -	} -	if (0x1 & sysctl_tcp_cookie_size) { -		/* 8-bit multiple, illegal, fix it */ -		return (u8)(sysctl_tcp_cookie_size + 0x1); -	} -	return (u8)sysctl_tcp_cookie_size; -} -  /* Write previously computed TCP options to the packet.   *   * Beware: Something in the Internet is very sensitive to the ordering of   * TCP options, we learned this through the hard way, so be careful here.   * Luckily we can at least blame others for their non-compliance but from - * inter-operatibility perspective it seems that we're somewhat stuck with + * inter-operability perspective it seems that we're somewhat stuck with   * the ordering which we have been using if we want to keep working with   * those broken things (not that it currently hurts anybody as there isn't   * particular reason why the ordering would need to be changed). @@ -425,29 +432,11 @@ static u8 tcp_cookie_size_check(u8 desired)  static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,  			      struct tcp_out_options *opts)  { -	u8 options = opts->options;	/* mungable copy */ +	u16 options = opts->options;	/* mungable copy */ -	/* Having both authentication and cookies for security is redundant, -	 * and there's certainly not enough room.  Instead, the cookie-less -	 * extension variant is proposed. -	 * -	 * Consider the pessimal case with authentication.  The options -	 * could look like: -	 *   COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40 -	 */  	if (unlikely(OPTION_MD5 & options)) { -		if (unlikely(OPTION_COOKIE_EXTENSION & options)) { -			*ptr++ = htonl((TCPOPT_COOKIE << 24) | -				       (TCPOLEN_COOKIE_BASE << 16) | -				       (TCPOPT_MD5SIG << 8) | -				       TCPOLEN_MD5SIG); -		} else { -			*ptr++ = htonl((TCPOPT_NOP << 24) | -				       (TCPOPT_NOP << 16) | -				       (TCPOPT_MD5SIG << 8) | -				       TCPOLEN_MD5SIG); -		} -		options &= ~OPTION_COOKIE_EXTENSION; +		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | +			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);  		/* overload cookie hash location */  		opts->hash_location = (__u8 *)ptr;  		ptr += 4; @@ -476,44 +465,6 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,  		*ptr++ = htonl(opts->tsecr);  	} -	/* Specification requires after timestamp, so do it now. -	 * -	 * Consider the pessimal case without authentication.  The options -	 * could look like: -	 *   MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40 -	 */ -	if (unlikely(OPTION_COOKIE_EXTENSION & options)) { -		__u8 *cookie_copy = opts->hash_location; -		u8 cookie_size = opts->hash_size; - -		/* 8-bit multiple handled in tcp_cookie_size_check() above, -		 * and elsewhere. 
-		 */ -		if (0x2 & cookie_size) { -			__u8 *p = (__u8 *)ptr; - -			/* 16-bit multiple */ -			*p++ = TCPOPT_COOKIE; -			*p++ = TCPOLEN_COOKIE_BASE + cookie_size; -			*p++ = *cookie_copy++; -			*p++ = *cookie_copy++; -			ptr++; -			cookie_size -= 2; -		} else { -			/* 32-bit multiple */ -			*ptr++ = htonl(((TCPOPT_NOP << 24) | -					(TCPOPT_NOP << 16) | -					(TCPOPT_COOKIE << 8) | -					TCPOLEN_COOKIE_BASE) + -				       cookie_size); -		} - -		if (cookie_size > 0) { -			memcpy(ptr, cookie_copy, cookie_size); -			ptr += (cookie_size / 4); -		} -	} -  	if (unlikely(OPTION_SACK_ADVERTISE & options)) {  		*ptr++ = htonl((TCPOPT_NOP << 24) |  			       (TCPOPT_NOP << 16) | @@ -547,20 +498,33 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,  		tp->rx_opt.dsack = 0;  	} + +	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { +		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; + +		*ptr++ = htonl((TCPOPT_EXP << 24) | +			       ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | +			       TCPOPT_FASTOPEN_MAGIC); + +		memcpy(ptr, foc->val, foc->len); +		if ((foc->len & 3) == 2) { +			u8 *align = ((u8 *)ptr) + foc->len; +			align[0] = align[1] = TCPOPT_NOP; +		} +		ptr += (foc->len + 3) >> 2; +	}  }  /* Compute TCP options for SYN packets. This is not the final   * network wire format yet.   */ -static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, +static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,  				struct tcp_out_options *opts, -				struct tcp_md5sig_key **md5) { +				struct tcp_md5sig_key **md5) +{  	struct tcp_sock *tp = tcp_sk(sk); -	struct tcp_cookie_values *cvp = tp->cookie_values; -	unsigned remaining = MAX_TCP_OPTION_SPACE; -	u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? -			 tcp_cookie_size_check(cvp->cookie_desired) : -			 0; +	unsigned int remaining = MAX_TCP_OPTION_SPACE; +	struct tcp_fastopen_request *fastopen = tp->fastopen_req;  #ifdef CONFIG_TCP_MD5SIG  	*md5 = tp->af_specific->md5_lookup(sk, sk); @@ -586,7 +550,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,  	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {  		opts->options |= OPTION_TS; -		opts->tsval = TCP_SKB_CB(skb)->when; +		opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset;  		opts->tsecr = tp->rx_opt.ts_recent;  		remaining -= TCPOLEN_TSTAMP_ALIGNED;  	} @@ -601,68 +565,30 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,  			remaining -= TCPOLEN_SACKPERM_ALIGNED;  	} -	/* Note that timestamps are required by the specification. -	 * -	 * Odd numbers of bytes are prohibited by the specification, ensuring -	 * that the cookie is 16-bit aligned, and the resulting cookie pair is -	 * 32-bit aligned. -	 */ -	if (*md5 == NULL && -	    (OPTION_TS & opts->options) && -	    cookie_size > 0) { -		int need = TCPOLEN_COOKIE_BASE + cookie_size; - -		if (0x2 & need) { -			/* 32-bit multiple */ -			need += 2; /* NOPs */ - -			if (need > remaining) { -				/* try shrinking cookie to fit */ -				cookie_size -= 2; -				need -= 4; -			} -		} -		while (need > remaining && TCP_COOKIE_MIN <= cookie_size) { -			cookie_size -= 4; -			need -= 4; -		} -		if (TCP_COOKIE_MIN <= cookie_size) { -			opts->options |= OPTION_COOKIE_EXTENSION; -			opts->hash_location = (__u8 *)&cvp->cookie_pair[0]; -			opts->hash_size = cookie_size; - -			/* Remember for future incarnations. 
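The Fast Open cookie written above rides in the experimental TCP option: a 4-byte kind/length/magic header followed by the cookie, padded with NOPs whenever the cookie leaves the option two bytes short of a 32-bit boundary. A small sketch of the space calculation shared by tcp_options_write() and tcp_syn_options(); the constant mirrors TCPOLEN_EXP_FASTOPEN_BASE and the cookie lengths are illustrative (8 bytes is the usual size):

#include <stdio.h>

#define TCPOLEN_EXP_FASTOPEN_BASE 4	/* kind(1) + len(1) + 16-bit magic */

/* Option space consumed by a Fast Open cookie of 'len' bytes, rounded up
 * to a 32-bit multiple exactly like the "(need + 3) & ~3U" above; the
 * leftover bytes are filled with TCPOPT_NOP by tcp_options_write().
 */
static unsigned int fastopen_option_space(unsigned int len)
{
	unsigned int need = TCPOLEN_EXP_FASTOPEN_BASE + len;

	return (need + 3) & ~3U;
}

int main(void)
{
	for (unsigned int len = 4; len <= 16; len += 2) {
		unsigned int space = fastopen_option_space(len);

		printf("cookie %2u bytes -> option %2u bytes, %u NOP pad\n",
		       len, space, space - (TCPOLEN_EXP_FASTOPEN_BASE + len));
	}
	return 0;
}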
*/ -			cvp->cookie_desired = cookie_size; - -			if (cvp->cookie_desired != cvp->cookie_pair_size) { -				/* Currently use random bytes as a nonce, -				 * assuming these are completely unpredictable -				 * by hostile users of the same system. -				 */ -				get_random_bytes(&cvp->cookie_pair[0], -						 cookie_size); -				cvp->cookie_pair_size = cookie_size; -			} - +	if (fastopen && fastopen->cookie.len >= 0) { +		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; +		need = (need + 3) & ~3U;  /* Align to 32 bits */ +		if (remaining >= need) { +			opts->options |= OPTION_FAST_OPEN_COOKIE; +			opts->fastopen_cookie = &fastopen->cookie;  			remaining -= need; +			tp->syn_fastopen = 1;  		}  	} +  	return MAX_TCP_OPTION_SPACE - remaining;  }  /* Set up TCP options for SYN-ACKs. */ -static unsigned tcp_synack_options(struct sock *sk, +static unsigned int tcp_synack_options(struct sock *sk,  				   struct request_sock *req, -				   unsigned mss, struct sk_buff *skb, +				   unsigned int mss, struct sk_buff *skb,  				   struct tcp_out_options *opts,  				   struct tcp_md5sig_key **md5, -				   struct tcp_extend_values *xvp) +				   struct tcp_fastopen_cookie *foc)  {  	struct inet_request_sock *ireq = inet_rsk(req); -	unsigned remaining = MAX_TCP_OPTION_SPACE; -	u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? -			 xvp->cookie_plus : -			 0; +	unsigned int remaining = MAX_TCP_OPTION_SPACE;  #ifdef CONFIG_TCP_MD5SIG  	*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); @@ -701,43 +627,33 @@ static unsigned tcp_synack_options(struct sock *sk,  		if (unlikely(!ireq->tstamp_ok))  			remaining -= TCPOLEN_SACKPERM_ALIGNED;  	} - -	/* Similar rationale to tcp_syn_options() applies here, too. -	 * If the <SYN> options fit, the same options should fit now! -	 */ -	if (*md5 == NULL && -	    ireq->tstamp_ok && -	    cookie_plus > TCPOLEN_COOKIE_BASE) { -		int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */ - -		if (0x2 & need) { -			/* 32-bit multiple */ -			need += 2; /* NOPs */ -		} -		if (need <= remaining) { -			opts->options |= OPTION_COOKIE_EXTENSION; -			opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE; +	if (foc != NULL && foc->len >= 0) { +		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; +		need = (need + 3) & ~3U;  /* Align to 32 bits */ +		if (remaining >= need) { +			opts->options |= OPTION_FAST_OPEN_COOKIE; +			opts->fastopen_cookie = foc;  			remaining -= need; -		} else { -			/* There's no error return, so flag it. */ -			xvp->cookie_out_never = 1; /* true */ -			opts->hash_size = 0;  		}  	} +  	return MAX_TCP_OPTION_SPACE - remaining;  }  /* Compute TCP options for ESTABLISHED sockets. This is not the   * final wire format yet.   */ -static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, +static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,  					struct tcp_out_options *opts, -					struct tcp_md5sig_key **md5) { +					struct tcp_md5sig_key **md5) +{  	struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;  	struct tcp_sock *tp = tcp_sk(sk); -	unsigned size = 0; +	unsigned int size = 0;  	unsigned int eff_sacks; +	opts->options = 0; +  #ifdef CONFIG_TCP_MD5SIG  	*md5 = tp->af_specific->md5_lookup(sk, sk);  	if (unlikely(*md5)) { @@ -750,16 +666,16 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,  	if (likely(tp->rx_opt.tstamp_ok)) {  		opts->options |= OPTION_TS; -		opts->tsval = tcb ? tcb->when : 0; +		opts->tsval = tcb ? 
tcb->when + tp->tsoffset : 0;  		opts->tsecr = tp->rx_opt.ts_recent;  		size += TCPOLEN_TSTAMP_ALIGNED;  	}  	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;  	if (unlikely(eff_sacks)) { -		const unsigned remaining = MAX_TCP_OPTION_SPACE - size; +		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;  		opts->num_sack_blocks = -			min_t(unsigned, eff_sacks, +			min_t(unsigned int, eff_sacks,  			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /  			      TCPOLEN_SACK_PERBLOCK);  		size += TCPOLEN_SACK_BASE_ALIGNED + @@ -769,6 +685,172 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,  	return size;  } + +/* TCP SMALL QUEUES (TSQ) + * + * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) + * to reduce RTT and bufferbloat. + * We do this using a special skb destructor (tcp_wfree). + * + * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb + * needs to be reallocated in a driver. + * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc + * + * Since transmit from skb destructor is forbidden, we use a tasklet + * to process all sockets that eventually need to send more skbs. + * We use one tasklet per cpu, with its own queue of sockets. + */ +struct tsq_tasklet { +	struct tasklet_struct	tasklet; +	struct list_head	head; /* queue of tcp sockets */ +}; +static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); + +static void tcp_tsq_handler(struct sock *sk) +{ +	if ((1 << sk->sk_state) & +	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | +	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)) +		tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, +			       0, GFP_ATOMIC); +} +/* + * One tasklet per cpu tries to send more skbs. + * We run in tasklet context but need to disable irqs when + * transferring tsq->head because tcp_wfree() might + * interrupt us (non NAPI drivers) + */ +static void tcp_tasklet_func(unsigned long data) +{ +	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; +	LIST_HEAD(list); +	unsigned long flags; +	struct list_head *q, *n; +	struct tcp_sock *tp; +	struct sock *sk; + +	local_irq_save(flags); +	list_splice_init(&tsq->head, &list); +	local_irq_restore(flags); + +	list_for_each_safe(q, n, &list) { +		tp = list_entry(q, struct tcp_sock, tsq_node); +		list_del(&tp->tsq_node); + +		sk = (struct sock *)tp; +		bh_lock_sock(sk); + +		if (!sock_owned_by_user(sk)) { +			tcp_tsq_handler(sk); +		} else { +			/* defer the work to tcp_release_cb() */ +			set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); +		} +		bh_unlock_sock(sk); + +		clear_bit(TSQ_QUEUED, &tp->tsq_flags); +		sk_free(sk); +	} +} + +#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\ +			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\ +			  (1UL << TCP_DELACK_TIMER_DEFERRED) |	\ +			  (1UL << TCP_MTU_REDUCED_DEFERRED)) +/** + * tcp_release_cb - tcp release_sock() callback + * @sk: socket + * + * called from release_sock() to perform protocol dependent + * actions before socket release. 
+ */ +void tcp_release_cb(struct sock *sk) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	unsigned long flags, nflags; + +	/* perform an atomic operation only if at least one flag is set */ +	do { +		flags = tp->tsq_flags; +		if (!(flags & TCP_DEFERRED_ALL)) +			return; +		nflags = flags & ~TCP_DEFERRED_ALL; +	} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); + +	if (flags & (1UL << TCP_TSQ_DEFERRED)) +		tcp_tsq_handler(sk); + +	/* Here begins the tricky part : +	 * We are called from release_sock() with : +	 * 1) BH disabled +	 * 2) sk_lock.slock spinlock held +	 * 3) socket owned by us (sk->sk_lock.owned == 1) +	 * +	 * But following code is meant to be called from BH handlers, +	 * so we should keep BH disabled, but early release socket ownership +	 */ +	sock_release_ownership(sk); + +	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { +		tcp_write_timer_handler(sk); +		__sock_put(sk); +	} +	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { +		tcp_delack_timer_handler(sk); +		__sock_put(sk); +	} +	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { +		sk->sk_prot->mtu_reduced(sk); +		__sock_put(sk); +	} +} +EXPORT_SYMBOL(tcp_release_cb); + +void __init tcp_tasklet_init(void) +{ +	int i; + +	for_each_possible_cpu(i) { +		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); + +		INIT_LIST_HEAD(&tsq->head); +		tasklet_init(&tsq->tasklet, +			     tcp_tasklet_func, +			     (unsigned long)tsq); +	} +} + +/* + * Write buffer destructor automatically called from kfree_skb. + * We can't xmit new skbs from this context, as we might already + * hold qdisc lock. + */ +void tcp_wfree(struct sk_buff *skb) +{ +	struct sock *sk = skb->sk; +	struct tcp_sock *tp = tcp_sk(sk); + +	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && +	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { +		unsigned long flags; +		struct tsq_tasklet *tsq; + +		/* Keep a ref on socket. +		 * This last ref will be released in tcp_tasklet_func() +		 */ +		atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); + +		/* queue this socket to tasklet queue */ +		local_irq_save(flags); +		tsq = &__get_cpu_var(tsq_tasklet); +		list_add(&tp->tsq_node, &tsq->head); +		tasklet_schedule(&tsq->tasklet); +		local_irq_restore(flags); +	} else { +		sock_wfree(skb); +	} +} +  /* This routine actually transmits TCP packets queued in by   * tcp_do_sendmsg().  This is used by both the initial   * transmission and possible later retransmissions. @@ -788,26 +870,24 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  	struct tcp_sock *tp;  	struct tcp_skb_cb *tcb;  	struct tcp_out_options opts; -	unsigned tcp_options_size, tcp_header_size; +	unsigned int tcp_options_size, tcp_header_size;  	struct tcp_md5sig_key *md5;  	struct tcphdr *th;  	int err;  	BUG_ON(!skb || !tcp_skb_pcount(skb)); -	/* If congestion control is doing timestamping, we must -	 * take such a timestamp before we potentially clone/copy. 
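tcp_release_cb() above claims all pending deferred-work bits in one atomic step: snapshot tsq_flags, clear the deferred bits in a copy, and retry the cmpxchg until no other context raced in between. A userspace sketch of the same claim-and-clear pattern using C11 atomics; the names and flag values are illustrative, not kernel API:

#include <stdatomic.h>
#include <stdio.h>

#define DEFERRED_A	(1UL << 0)
#define DEFERRED_B	(1UL << 1)
#define DEFERRED_ALL	(DEFERRED_A | DEFERRED_B)

static _Atomic unsigned long tsq_flags;

/* Atomically take ownership of any deferred bits that are set,
 * clearing them in the shared word; returns the bits we claimed.
 */
static unsigned long claim_deferred(void)
{
	unsigned long flags, nflags;

	do {
		flags = atomic_load(&tsq_flags);
		if (!(flags & DEFERRED_ALL))
			return 0;
		nflags = flags & ~DEFERRED_ALL;
	} while (!atomic_compare_exchange_weak(&tsq_flags, &flags, nflags));

	return flags & DEFERRED_ALL;
}

int main(void)
{
	atomic_fetch_or(&tsq_flags, DEFERRED_B);	/* e.g. a timer handler deferred work */
	printf("claimed bits: %#lx\n", claim_deferred());
	printf("claimed bits: %#lx\n", claim_deferred());	/* nothing left */
	return 0;
}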
-	 */ -	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) -		__net_timestamp(skb); +	if (clone_it) { +		skb_mstamp_get(&skb->skb_mstamp); -	if (likely(clone_it)) {  		if (unlikely(skb_cloned(skb)))  			skb = pskb_copy(skb, gfp_mask);  		else  			skb = skb_clone(skb, gfp_mask);  		if (unlikely(!skb))  			return -ENOBUFS; +		/* Our usage of tstamp should remain private */ +		skb->tstamp.tv64 = 0;  	}  	inet = inet_sk(sk); @@ -815,22 +895,28 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  	tcb = TCP_SKB_CB(skb);  	memset(&opts, 0, sizeof(opts)); -	if (unlikely(tcb->flags & TCPHDR_SYN)) +	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))  		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);  	else  		tcp_options_size = tcp_established_options(sk, skb, &opts,  							   &md5);  	tcp_header_size = tcp_options_size + sizeof(struct tcphdr); -	if (tcp_packets_in_flight(tp) == 0) { +	if (tcp_packets_in_flight(tp) == 0)  		tcp_ca_event(sk, CA_EVENT_TX_START); -		skb->ooo_okay = 1; -	} else -		skb->ooo_okay = 0; + +	/* if no packet is in qdisc/device queue, then allow XPS to select +	 * another queue. +	 */ +	skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;  	skb_push(skb, tcp_header_size);  	skb_reset_transport_header(skb); -	skb_set_owner_w(skb, sk); + +	skb_orphan(skb); +	skb->sk = sk; +	skb->destructor = tcp_wfree; +	atomic_add(skb->truesize, &sk->sk_wmem_alloc);  	/* Build TCP header and checksum it. */  	th = tcp_hdr(skb); @@ -839,9 +925,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  	th->seq			= htonl(tcb->seq);  	th->ack_seq		= htonl(tp->rcv_nxt);  	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | -					tcb->flags); +					tcb->tcp_flags); -	if (unlikely(tcb->flags & TCPHDR_SYN)) { +	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {  		/* RFC1323: The window in SYN & SYN/ACK segments  		 * is never scaled.  		 */ @@ -864,7 +950,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  	}  	tcp_options_write((__be32 *)(th + 1), tp, &opts); -	if (likely((tcb->flags & TCPHDR_SYN) == 0)) +	if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))  		TCP_ECN_send(sk, skb, tcp_header_size);  #ifdef CONFIG_TCP_MD5SIG @@ -878,17 +964,17 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  	icsk->icsk_af_ops->send_check(sk, skb); -	if (likely(tcb->flags & TCPHDR_ACK)) +	if (likely(tcb->tcp_flags & TCPHDR_ACK))  		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));  	if (skb->len != tcp_header_size) -		tcp_event_data_sent(tp, skb, sk); +		tcp_event_data_sent(tp, sk);  	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)  		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,  			      tcp_skb_pcount(skb)); -	err = icsk->icsk_af_ops->queue_xmit(skb); +	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);  	if (likely(err <= 0))  		return err; @@ -915,28 +1001,32 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)  }  /* Initialize TSO segments for a packet. 
*/ -static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, +static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,  				 unsigned int mss_now)  { -	if (skb->len <= mss_now || !sk_can_gso(sk) || -	    skb->ip_summed == CHECKSUM_NONE) { +	struct skb_shared_info *shinfo = skb_shinfo(skb); + +	/* Make sure we own this skb before messing gso_size/gso_segs */ +	WARN_ON_ONCE(skb_cloned(skb)); + +	if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {  		/* Avoid the costly divide in the normal  		 * non-TSO case.  		 */ -		skb_shinfo(skb)->gso_segs = 1; -		skb_shinfo(skb)->gso_size = 0; -		skb_shinfo(skb)->gso_type = 0; +		shinfo->gso_segs = 1; +		shinfo->gso_size = 0; +		shinfo->gso_type = 0;  	} else { -		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now); -		skb_shinfo(skb)->gso_size = mss_now; -		skb_shinfo(skb)->gso_type = sk->sk_gso_type; +		shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); +		shinfo->gso_size = mss_now; +		shinfo->gso_type = sk->sk_gso_type;  	}  }  /* When a modification to fackets out becomes necessary, we need to check   * skb is counted to fackets_out or not.   */ -static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, +static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,  				   int decr)  {  	struct tcp_sock *tp = tcp_sk(sk); @@ -951,7 +1041,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,  /* Pcount in the middle of the write queue got changed, we need to do various   * tweaks to fix counters   */ -static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr) +static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)  {  	struct tcp_sock *tp = tcp_sk(sk); @@ -984,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr)   * Remember, these are still headerless SKBs at this point.   */  int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, -		 unsigned int mss_now) +		 unsigned int mss_now, gfp_t gfp)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *buff; @@ -992,19 +1082,18 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,  	int nlen;  	u8 flags; -	BUG_ON(len > skb->len); +	if (WARN_ON(len > skb->len)) +		return -EINVAL;  	nsize = skb_headlen(skb) - len;  	if (nsize < 0)  		nsize = 0; -	if (skb_cloned(skb) && -	    skb_is_nonlinear(skb) && -	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) +	if (skb_unclone(skb, gfp))  		return -ENOMEM;  	/* Get a new skb... force flag on. */ -	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); +	buff = sk_stream_alloc_skb(sk, nsize, gfp);  	if (buff == NULL)  		return -ENOMEM; /* We'll just try again later. */ @@ -1020,9 +1109,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,  	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;  	/* PSH and FIN should only be set in the second packet. 
*/ -	flags = TCP_SKB_CB(skb)->flags; -	TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); -	TCP_SKB_CB(buff)->flags = flags; +	flags = TCP_SKB_CB(skb)->tcp_flags; +	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); +	TCP_SKB_CB(buff)->tcp_flags = flags;  	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;  	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { @@ -1077,25 +1166,36 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,   */  static void __pskb_trim_head(struct sk_buff *skb, int len)  { +	struct skb_shared_info *shinfo;  	int i, k, eat; +	eat = min_t(int, len, skb_headlen(skb)); +	if (eat) { +		__skb_pull(skb, eat); +		len -= eat; +		if (!len) +			return; +	}  	eat = len;  	k = 0; -	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { -		if (skb_shinfo(skb)->frags[i].size <= eat) { -			put_page(skb_shinfo(skb)->frags[i].page); -			eat -= skb_shinfo(skb)->frags[i].size; +	shinfo = skb_shinfo(skb); +	for (i = 0; i < shinfo->nr_frags; i++) { +		int size = skb_frag_size(&shinfo->frags[i]); + +		if (size <= eat) { +			skb_frag_unref(skb, i); +			eat -= size;  		} else { -			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; +			shinfo->frags[k] = shinfo->frags[i];  			if (eat) { -				skb_shinfo(skb)->frags[k].page_offset += eat; -				skb_shinfo(skb)->frags[k].size -= eat; +				shinfo->frags[k].page_offset += eat; +				skb_frag_size_sub(&shinfo->frags[k], eat);  				eat = 0;  			}  			k++;  		}  	} -	skb_shinfo(skb)->nr_frags = k; +	shinfo->nr_frags = k;  	skb_reset_tail_pointer(skb);  	skb->data_len -= len; @@ -1105,14 +1205,10 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)  /* Remove acked data from a packet in the transmit queue. */  int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)  { -	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) +	if (skb_unclone(skb, GFP_ATOMIC))  		return -ENOMEM; -	/* If len == headlen, we avoid __skb_pull to preserve alignment. */ -	if (unlikely(len < skb_headlen(skb))) -		__skb_pull(skb, len); -	else -		__pskb_trim_head(skb, len - skb_headlen(skb)); +	__pskb_trim_head(skb, len);  	TCP_SKB_CB(skb)->seq += len;  	skb->ip_summed = CHECKSUM_PARTIAL; @@ -1122,20 +1218,18 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)  	sk_mem_uncharge(sk, len);  	sock_set_flag(sk, SOCK_QUEUE_SHRUNK); -	/* Any change of skb->len requires recalculation of tso -	 * factor and mss. -	 */ +	/* Any change of skb->len requires recalculation of tso factor. */  	if (tcp_skb_pcount(skb) > 1) -		tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk)); +		tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));  	return 0;  } -/* Calculate MSS. Not accounting for SACKs here.  */ -int tcp_mtu_to_mss(struct sock *sk, int pmtu) +/* Calculate MSS not accounting any TCP options.  
*/ +static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)  { -	struct tcp_sock *tp = tcp_sk(sk); -	struct inet_connection_sock *icsk = inet_csk(sk); +	const struct tcp_sock *tp = tcp_sk(sk); +	const struct inet_connection_sock *icsk = inet_csk(sk);  	int mss_now;  	/* Calculate base mss without TCP options: @@ -1143,6 +1237,14 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)  	 */  	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); +	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ +	if (icsk->icsk_af_ops->net_frag_header_len) { +		const struct dst_entry *dst = __sk_dst_get(sk); + +		if (dst && dst_allfrag(dst)) +			mss_now -= icsk->icsk_af_ops->net_frag_header_len; +	} +  	/* Clamp it (mss_clamp does not include tcp options) */  	if (mss_now > tp->rx_opt.mss_clamp)  		mss_now = tp->rx_opt.mss_clamp; @@ -1153,18 +1255,22 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)  	/* Then reserve room for full set of TCP options and 8 bytes of data */  	if (mss_now < 48)  		mss_now = 48; - -	/* Now subtract TCP options size, not including SACKs */ -	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); -  	return mss_now;  } +/* Calculate MSS. Not accounting for SACKs here.  */ +int tcp_mtu_to_mss(struct sock *sk, int pmtu) +{ +	/* Subtract TCP options size, not including SACKs */ +	return __tcp_mtu_to_mss(sk, pmtu) - +	       (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); +} +  /* Inverse of above */  int tcp_mss_to_mtu(struct sock *sk, int mss)  { -	struct tcp_sock *tp = tcp_sk(sk); -	struct inet_connection_sock *icsk = inet_csk(sk); +	const struct tcp_sock *tp = tcp_sk(sk); +	const struct inet_connection_sock *icsk = inet_csk(sk);  	int mtu;  	mtu = mss + @@ -1172,6 +1278,13 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)  	      icsk->icsk_ext_hdr_len +  	      icsk->icsk_af_ops->net_header_len; +	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ +	if (icsk->icsk_af_ops->net_frag_header_len) { +		const struct dst_entry *dst = __sk_dst_get(sk); + +		if (dst && dst_allfrag(dst)) +			mtu += icsk->icsk_af_ops->net_frag_header_len; +	}  	return mtu;  } @@ -1238,10 +1351,10 @@ EXPORT_SYMBOL(tcp_sync_mss);   */  unsigned int tcp_current_mss(struct sock *sk)  { -	struct tcp_sock *tp = tcp_sk(sk); -	struct dst_entry *dst = __sk_dst_get(sk); +	const struct tcp_sock *tp = tcp_sk(sk); +	const struct dst_entry *dst = __sk_dst_get(sk);  	u32 mss_now; -	unsigned header_len; +	unsigned int header_len;  	struct tcp_out_options opts;  	struct tcp_md5sig_key *md5; @@ -1267,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk)  	return mss_now;  } -/* Congestion window validation. (RFC2861) */ -static void tcp_cwnd_validate(struct sock *sk) +/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, + * and if application hit its sndbuf limit recently. + */ +static void tcp_cwnd_application_limited(struct sock *sk) +{ +	struct tcp_sock *tp = tcp_sk(sk); + +	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && +	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { +		/* Limited by application or receiver window. 
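__tcp_mtu_to_mss() above now yields the MSS before any TCP options, and tcp_mtu_to_mss() subtracts the fixed option overhead (tcp_header_len minus the base header) on top of it. A worked example of the arithmetic for a plain IPv4 path; the 20/20/12 byte header sizes are the usual values and the allfrag/IPv6 frag-header case above is ignored:

#include <stdio.h>

#define IP4_HDR		20
#define TCP_HDR		20
#define TS_OPT_ALIGNED	12	/* TCPOLEN_TSTAMP_ALIGNED, when timestamps are on */

static int mtu_to_mss_no_opts(int pmtu)		/* ~__tcp_mtu_to_mss() */
{
	int mss = pmtu - IP4_HDR - TCP_HDR;

	return mss < 48 ? 48 : mss;	/* keep room for options + 8 bytes of data */
}

static int mtu_to_mss(int pmtu, int opt_len)	/* ~tcp_mtu_to_mss() */
{
	return mtu_to_mss_no_opts(pmtu) - opt_len;
}

static int mss_to_mtu(int mss, int opt_len)	/* ~tcp_mss_to_mtu() */
{
	return mss + opt_len + TCP_HDR + IP4_HDR;
}

int main(void)
{
	int pmtu = 1500;

	printf("pmtu %d -> mss %d (no options), %d (with timestamps)\n",
	       pmtu, mtu_to_mss(pmtu, 0), mtu_to_mss(pmtu, TS_OPT_ALIGNED));
	printf("mss 1448 + timestamps -> mtu %d\n",
	       mss_to_mtu(1448, TS_OPT_ALIGNED));
	return 0;
}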
*/ +		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); +		u32 win_used = max(tp->snd_cwnd_used, init_win); +		if (win_used < tp->snd_cwnd) { +			tp->snd_ssthresh = tcp_current_ssthresh(sk); +			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; +		} +		tp->snd_cwnd_used = 0; +	} +	tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)  {  	struct tcp_sock *tp = tcp_sk(sk); -	if (tp->packets_out >= tp->snd_cwnd) { +	/* Track the maximum number of outstanding packets in each +	 * window, and remember whether we were cwnd-limited then. +	 */ +	if (!before(tp->snd_una, tp->max_packets_seq) || +	    tp->packets_out > tp->max_packets_out) { +		tp->max_packets_out = tp->packets_out; +		tp->max_packets_seq = tp->snd_nxt; +		tp->is_cwnd_limited = is_cwnd_limited; +	} + +	if (tcp_is_cwnd_limited(sk)) {  		/* Network is feed fully. */  		tp->snd_cwnd_used = 0;  		tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1287,48 +1431,85 @@ static void tcp_cwnd_validate(struct sock *sk)  	}  } -/* Returns the portion of skb which can be sent right away without - * introducing MSS oddities to segment boundaries. In rare cases where - * mss_now != mss_cache, we will request caller to create a small skb - * per input skb which could be mostly avoided here (if desired). - * - * We explicitly want to create a request for splitting write queue tail - * to a small skb for Nagle purposes while avoiding unnecessary modulos, - * thus all the complexity (cwnd_len is always MSS multiple which we - * return whenever allowed by the other factors). Basically we need the - * modulo only when the receiver window alone is the limiting factor or - * when we would be allowed to send the split-due-to-Nagle skb fully. +/* Minshall's variant of the Nagle send check. */ +static bool tcp_minshall_check(const struct tcp_sock *tp) +{ +	return after(tp->snd_sml, tp->snd_una) && +		!after(tp->snd_sml, tp->snd_nxt); +} + +/* Update snd_sml if this skb is under mss + * Note that a TSO packet might end with a sub-mss segment + * The test is really : + * if ((skb->len % mss) != 0) + *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq; + * But we can avoid doing the divide again given we already have + *  skb_pcount = skb->len / mss_now   */ -static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, -					unsigned int mss_now, unsigned int cwnd) +static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, +				const struct sk_buff *skb)  { -	struct tcp_sock *tp = tcp_sk(sk); -	u32 needed, window, cwnd_len; +	if (skb->len < tcp_skb_pcount(skb) * mss_now) +		tp->snd_sml = TCP_SKB_CB(skb)->end_seq; +} + +/* Return false, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. (provided by caller in %partial bool) + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + *    With Minshall's modification: all sent small packets are ACKed. 
+ */ +static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, +			    int nonagle) +{ +	return partial && +		((nonagle & TCP_NAGLE_CORK) || +		 (!nonagle && tp->packets_out && tcp_minshall_check(tp))); +} +/* Returns the portion of skb which can be sent right away */ +static unsigned int tcp_mss_split_point(const struct sock *sk, +					const struct sk_buff *skb, +					unsigned int mss_now, +					unsigned int max_segs, +					int nonagle) +{ +	const struct tcp_sock *tp = tcp_sk(sk); +	u32 partial, needed, window, max_len;  	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; -	cwnd_len = mss_now * cwnd; +	max_len = mss_now * max_segs; -	if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) -		return cwnd_len; +	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) +		return max_len;  	needed = min(skb->len, window); -	if (cwnd_len <= needed) -		return cwnd_len; +	if (max_len <= needed) +		return max_len; + +	partial = needed % mss_now; +	/* If last segment is not a full MSS, check if Nagle rules allow us +	 * to include this last segment in this skb. +	 * Otherwise, we'll split the skb at last MSS boundary +	 */ +	if (tcp_nagle_check(partial != 0, tp, nonagle)) +		return needed - partial; -	return needed - needed % mss_now; +	return needed;  }  /* Can at least one segment of SKB be sent right now, according to the   * congestion window rules?  If so, return how many segments are allowed.   */ -static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, -					 struct sk_buff *skb) +static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, +					 const struct sk_buff *skb)  {  	u32 in_flight, cwnd;  	/* Don't be strict about the congestion window for the final FIN.  */ -	if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1) +	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && +	    tcp_skb_pcount(skb) == 1)  		return 1;  	in_flight = tcp_packets_in_flight(tp); @@ -1339,11 +1520,11 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,  	return 0;  } -/* Intialize TSO state of a skb. +/* Initialize TSO state of a skb.   * This must be invoked the first time we consider transmitting   * SKB onto the wire.   */ -static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, +static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,  			     unsigned int mss_now)  {  	int tso_segs = tcp_skb_pcount(skb); @@ -1355,34 +1536,12 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,  	return tso_segs;  } -/* Minshall's variant of the Nagle send check. */ -static inline int tcp_minshall_check(const struct tcp_sock *tp) -{ -	return after(tp->snd_sml, tp->snd_una) && -		!after(tp->snd_sml, tp->snd_nxt); -} -/* Return 0, if packet can be sent now without violation Nagle's rules: - * 1. It is full sized. - * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_NODELAY was set. - * 4. Or TCP_CORK is not set, and all sent packets are ACKed. - *    With Minshall's modification: all sent small packets are ACKed. - */ -static inline int tcp_nagle_check(const struct tcp_sock *tp, -				  const struct sk_buff *skb, -				  unsigned mss_now, int nonagle) -{ -	return skb->len < mss_now && -		((nonagle & TCP_NAGLE_CORK) || -		 (!nonagle && tp->packets_out && tcp_minshall_check(tp))); -} - -/* Return non-zero if the Nagle test allows this packet to be +/* Return true if the Nagle test allows this packet to be   * sent now.   
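tcp_mss_split_point() above now keeps the trailing sub-MSS chunk of the tail skb only when the Nagle/Minshall rules would let a small segment out; otherwise it trims the send to an MSS boundary. A simplified sketch of that tail-of-queue decision (the full-sized max_len path and the cwnd clamp are left out):

#include <stdio.h>

/* Bytes of the tail skb to send now: whole MSS multiples always, plus the
 * final partial segment only if Nagle does not object to a small packet.
 */
static unsigned int split_point(unsigned int skb_len, unsigned int window,
				unsigned int mss, int nagle_allows_partial)
{
	unsigned int needed = skb_len < window ? skb_len : window;
	unsigned int partial = needed % mss;

	if (partial && !nagle_allows_partial)
		return needed - partial;
	return needed;
}

int main(void)
{
	/* 5000 bytes queued, plenty of window, mss 1448 */
	printf("corked / small pkts in flight: send %u bytes\n",
	       split_point(5000, 65535, 1448, 0));
	printf("nodelay or all acked:          send %u bytes\n",
	       split_point(5000, 65535, 1448, 1));
	return 0;
}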
*/ -static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, -				 unsigned int cur_mss, int nonagle) +static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, +				  unsigned int cur_mss, int nonagle)  {  	/* Nagle rule does not apply to frames, which sit in the middle of the  	 * write_queue (they have no chances to get new data). @@ -1391,24 +1550,22 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,  	 * argument based upon the location of SKB in the send queue.  	 */  	if (nonagle & TCP_NAGLE_PUSH) -		return 1; +		return true; -	/* Don't use the nagle rule for urgent data (or for the final FIN). -	 * Nagle can be ignored during F-RTO too (see RFC4138). -	 */ -	if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || -	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)) -		return 1; +	/* Don't use the nagle rule for urgent data (or for the final FIN). */ +	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) +		return true; -	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) -		return 1; +	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) +		return true; -	return 0; +	return false;  }  /* Does at least the first segment of SKB fit into the send window? */ -static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, -				   unsigned int cur_mss) +static bool tcp_snd_wnd_test(const struct tcp_sock *tp, +			     const struct sk_buff *skb, +			     unsigned int cur_mss)  {  	u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -1422,10 +1579,10 @@ static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb,   * should be put on the wire right now.  If so, it returns the number of   * packets allowed by the congestion window.   */ -static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, +static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,  				 unsigned int cur_mss, int nonagle)  { -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	unsigned int cwnd_quota;  	tcp_init_tso_segs(sk, skb, cur_mss); @@ -1441,9 +1598,9 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,  }  /* Test if sending is allowed right now. */ -int tcp_may_send_now(struct sock *sk) +bool tcp_may_send_now(struct sock *sk)  { -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *skb = tcp_send_head(sk);  	return skb && @@ -1468,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,  	/* All of a TSO frame must be composed of paged data.  */  	if (skb->len != skb->data_len) -		return tcp_fragment(sk, skb, len, mss_now); +		return tcp_fragment(sk, skb, len, mss_now, gfp);  	buff = sk_stream_alloc_skb(sk, 0, gfp);  	if (unlikely(buff == NULL)) @@ -1485,9 +1642,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,  	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;  	/* PSH and FIN should only be set in the second packet. */ -	flags = TCP_SKB_CB(skb)->flags; -	TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); -	TCP_SKB_CB(buff)->flags = flags; +	flags = TCP_SKB_CB(skb)->tcp_flags; +	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); +	TCP_SKB_CB(buff)->tcp_flags = flags;  	/* This packet was never sent out yet, so no SACK bits. 
*/  	TCP_SKB_CB(buff)->sacked = 0; @@ -1511,13 +1668,15 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,   *   * This algorithm is from John Heffner.   */ -static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, +				 bool *is_cwnd_limited)  {  	struct tcp_sock *tp = tcp_sk(sk);  	const struct inet_connection_sock *icsk = inet_csk(sk);  	u32 send_win, cong_win, limit, in_flight; +	int win_divisor; -	if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) +	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)  		goto send_now;  	if (icsk->icsk_ca_state != TCP_CA_Open) @@ -1540,20 +1699,22 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)  	limit = min(send_win, cong_win);  	/* If a full-sized TSO skb can be sent, do it. */ -	if (limit >= sk->sk_gso_max_size) +	if (limit >= min_t(unsigned int, sk->sk_gso_max_size, +			   tp->xmit_size_goal_segs * tp->mss_cache))  		goto send_now;  	/* Middle in queue won't get any more data, full sendable already? */  	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))  		goto send_now; -	if (sysctl_tcp_tso_win_divisor) { +	win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); +	if (win_divisor) {  		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);  		/* If at least some fraction of a window is available,  		 * just use it.  		 */ -		chunk /= sysctl_tcp_tso_win_divisor; +		chunk /= win_divisor;  		if (limit >= chunk)  			goto send_now;  	} else { @@ -1562,18 +1723,24 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)  		 * frame, so if we have space for more than 3 frames  		 * then send now.  		 */ -		if (limit > tcp_max_burst(tp) * tp->mss_cache) +		if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)  			goto send_now;  	} -	/* Ok, it looks like it is advisable to defer.  */ -	tp->tso_deferred = 1 | (jiffies << 1); +	/* Ok, it looks like it is advisable to defer. +	 * Do not rearm the timer if already set to not break TCP ACK clocking. +	 */ +	if (!tp->tso_deferred) +		tp->tso_deferred = 1 | (jiffies << 1); + +	if (cong_win < send_win && cong_win < skb->len) +		*is_cwnd_limited = true; -	return 1; +	return true;  send_now:  	tp->tso_deferred = 0; -	return 0; +	return false;  }  /* Create a new MTU probe if we are ready. @@ -1643,7 +1810,7 @@ static int tcp_mtu_probe(struct sock *sk)  	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;  	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; -	TCP_SKB_CB(nskb)->flags = TCPHDR_ACK; +	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;  	TCP_SKB_CB(nskb)->sacked = 0;  	nskb->csum = 0;  	nskb->ip_summed = skb->ip_summed; @@ -1663,11 +1830,11 @@ static int tcp_mtu_probe(struct sock *sk)  		if (skb->len <= copy) {  			/* We've eaten all the data from this skb.  			 * Throw it away. */ -			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; +			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;  			tcp_unlink_write_queue(skb, sk);  			sk_wmem_free_skb(sk, skb);  		} else { -			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & +			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &  						   ~(TCPHDR_FIN|TCPHDR_PSH);  			if (!skb_shinfo(skb)->nr_frags) {  				skb_pull(skb, copy); @@ -1715,17 +1882,21 @@ static int tcp_mtu_probe(struct sock *sk)   * snd_up-64k-mss .. snd_up cannot be large. However, taking into   * account rare use of URG, this is not a big flaw.   
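tcp_tso_should_defer() above holds back a not-yet-full TSO skb in the hope of coalescing more data, unless enough window is already sendable. With tcp_tso_win_divisor set, "enough" means at least 1/divisor of min(snd_wnd, snd_cwnd * mss). A rough sketch of just that branch; the divisor default of 3 is an assumption from the sysctl documentation, and the other early-send exits (FIN, non-Open CA state, full-sized TSO, tail of queue) are omitted:

#include <stdbool.h>
#include <stdio.h>

/* Defer a sub-max TSO skb until the sendable window reaches 1/divisor of
 * the whole window, following the win_divisor branch above.
 */
static bool should_defer(unsigned int send_win, unsigned int cong_win,
			 unsigned int snd_wnd, unsigned int cwnd_bytes,
			 unsigned int win_divisor)
{
	unsigned int limit = send_win < cong_win ? send_win : cong_win;
	unsigned int whole = snd_wnd < cwnd_bytes ? snd_wnd : cwnd_bytes;

	return limit < whole / win_divisor;
}

int main(void)
{
	/* 10 segments of cwnd at mss 1448, receiver window 64 KB */
	unsigned int cwnd_bytes = 10 * 1448, snd_wnd = 65536;

	printf("4 KB sendable: defer=%d\n",
	       should_defer(4096, 4096, snd_wnd, cwnd_bytes, 3));
	printf("8 KB sendable: defer=%d\n",
	       should_defer(8192, 8192, snd_wnd, cwnd_bytes, 3));
	return 0;
}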
* - * Returns 1, if no segments are in flight and we have queued segments, but - * cannot send anything now because of SWS or another problem. + * Send at most one packet when push_one > 0. Temporarily ignore + * cwnd limit to force at most one packet out when push_one == 2. + + * Returns true, if no segments are in flight and we have queued segments, + * but cannot send anything now because of SWS or another problem.   */ -static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, -			  int push_one, gfp_t gfp) +static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +			   int push_one, gfp_t gfp)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *skb;  	unsigned int tso_segs, sent_pkts;  	int cwnd_quota;  	int result; +	bool is_cwnd_limited = false;  	sent_pkts = 0; @@ -1733,7 +1904,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  		/* Do MTU probing. */  		result = tcp_mtu_probe(sk);  		if (!result) { -			return 0; +			return false;  		} else if (result > 0) {  			sent_pkts = 1;  		} @@ -1745,9 +1916,18 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);  		BUG_ON(!tso_segs); +		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) +			goto repair; /* Skip network transmission */ +  		cwnd_quota = tcp_cwnd_test(tp, skb); -		if (!cwnd_quota) -			break; +		if (!cwnd_quota) { +			is_cwnd_limited = true; +			if (push_one == 2) +				/* Force out a loss probe pkt. */ +				cwnd_quota = 1; +			else +				break; +		}  		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))  			break; @@ -1758,14 +1938,42 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  						      nonagle : TCP_NAGLE_PUSH))))  				break;  		} else { -			if (!push_one && tcp_tso_should_defer(sk, skb)) +			if (!push_one && +			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) +				break; +		} + +		/* TCP Small Queues : +		 * Control number of packets in qdisc/devices to two packets / or ~1 ms. +		 * This allows for : +		 *  - better RTT estimation and ACK scheduling +		 *  - faster recovery +		 *  - high rates +		 * Alas, some drivers / subsystems require a fair amount +		 * of queued bytes to ensure line rate. +		 * One example is wifi aggregation (802.11 AMPDU) +		 */ +		limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes, +			      sk->sk_pacing_rate >> 10); + +		if (atomic_read(&sk->sk_wmem_alloc) > limit) { +			set_bit(TSQ_THROTTLED, &tp->tsq_flags); +			/* It is possible TX completion already happened +			 * before we set TSQ_THROTTLED, so we must +			 * test again the condition. +			 */ +			smp_mb__after_atomic(); +			if (atomic_read(&sk->sk_wmem_alloc) > limit)  				break;  		}  		limit = mss_now;  		if (tso_segs > 1 && !tcp_urg_mode(tp))  			limit = tcp_mss_split_point(sk, skb, mss_now, -						    cwnd_quota); +						    min_t(unsigned int, +							  cwnd_quota, +							  sk->sk_gso_max_segs), +						    nonagle);  		if (skb->len > limit &&  		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) @@ -1776,23 +1984,165 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))  			break; +repair:  		/* Advance the send_head.  This one is sent out.  		 * This call will increment packets_out.  		 
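The TCP Small Queues check above caps the bytes a flow may have sitting in qdisc/device queues at the larger of tcp_limit_output_bytes (128 KB by default here) and sk_pacing_rate >> 10, i.e. roughly one millisecond worth of data at the current pacing rate. A quick illustration of what that limit works out to at a few rates:

#include <stdio.h>

static unsigned long tsq_limit(unsigned long pacing_rate_Bps,
			       unsigned long sysctl_limit_bytes)
{
	unsigned long ms_worth = pacing_rate_Bps >> 10;	/* ~1 ms at the pacing rate */

	return ms_worth > sysctl_limit_bytes ? ms_worth : sysctl_limit_bytes;
}

int main(void)
{
	unsigned long sysctl_limit = 131072;	/* two 64 KB TSO frames, the default above */
	unsigned long rates[] = {
		12500000UL,	/* 100 Mbit/s */
		125000000UL,	/*   1 Gbit/s */
		1250000000UL,	/*  10 Gbit/s */
	};

	for (int i = 0; i < 3; i++)
		printf("%10lu B/s -> TSQ limit %lu bytes\n",
		       rates[i], tsq_limit(rates[i], sysctl_limit));
	return 0;
}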
*/  		tcp_event_new_data_sent(sk, skb);  		tcp_minshall_update(tp, mss_now, skb); -		sent_pkts++; +		sent_pkts += tcp_skb_pcount(skb);  		if (push_one)  			break;  	}  	if (likely(sent_pkts)) { -		tcp_cwnd_validate(sk); -		return 0; +		if (tcp_in_cwnd_reduction(sk)) +			tp->prr_out += sent_pkts; + +		/* Send one loss probe per tail loss episode. */ +		if (push_one != 2) +			tcp_schedule_loss_probe(sk); +		tcp_cwnd_validate(sk, is_cwnd_limited); +		return false;  	} -	return !tp->packets_out && tcp_send_head(sk); +	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); +} + +bool tcp_schedule_loss_probe(struct sock *sk) +{ +	struct inet_connection_sock *icsk = inet_csk(sk); +	struct tcp_sock *tp = tcp_sk(sk); +	u32 timeout, tlp_time_stamp, rto_time_stamp; +	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); + +	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) +		return false; +	/* No consecutive loss probes. */ +	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { +		tcp_rearm_rto(sk); +		return false; +	} +	/* Don't do any loss probe on a Fast Open connection before 3WHS +	 * finishes. +	 */ +	if (sk->sk_state == TCP_SYN_RECV) +		return false; + +	/* TLP is only scheduled when next timer event is RTO. */ +	if (icsk->icsk_pending != ICSK_TIME_RETRANS) +		return false; + +	/* Schedule a loss probe in 2*RTT for SACK capable connections +	 * in Open state, that are either limited by cwnd or application. +	 */ +	if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out || +	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) +		return false; + +	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && +	     tcp_send_head(sk)) +		return false; + +	/* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account +	 * for delayed ack when there's one outstanding packet. +	 */ +	timeout = rtt << 1; +	if (tp->packets_out == 1) +		timeout = max_t(u32, timeout, +				(rtt + (rtt >> 1) + TCP_DELACK_MAX)); +	timeout = max_t(u32, timeout, msecs_to_jiffies(10)); + +	/* If RTO is shorter, just schedule TLP in its place. */ +	tlp_time_stamp = tcp_time_stamp + timeout; +	rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; +	if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { +		s32 delta = rto_time_stamp - tcp_time_stamp; +		if (delta > 0) +			timeout = delta; +	} + +	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, +				  TCP_RTO_MAX); +	return true; +} + +/* Thanks to skb fast clones, we can detect if a prior transmit of + * a packet is still in a qdisc or driver queue. + * In this case, there is very little point doing a retransmit ! + * Note: This is called from BH context only. + */ +static bool skb_still_in_host_queue(const struct sock *sk, +				    const struct sk_buff *skb) +{ +	const struct sk_buff *fclone = skb + 1; + +	if (unlikely(skb->fclone == SKB_FCLONE_ORIG && +		     fclone->fclone == SKB_FCLONE_CLONE)) { +		NET_INC_STATS_BH(sock_net(sk), +				 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); +		return true; +	} +	return false; +} + +/* When probe timeout (PTO) fires, send a new segment if one exists, else + * retransmit the last segment. + */ +void tcp_send_loss_probe(struct sock *sk) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	struct sk_buff *skb; +	int pcount; +	int mss = tcp_current_mss(sk); +	int err = -1; + +	if (tcp_send_head(sk) != NULL) { +		err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); +		goto rearm_timer; +	} + +	/* At most one outstanding TLP retransmission. 
*/ +	if (tp->tlp_high_seq) +		goto rearm_timer; + +	/* Retransmit last segment. */ +	skb = tcp_write_queue_tail(sk); +	if (WARN_ON(!skb)) +		goto rearm_timer; + +	if (skb_still_in_host_queue(sk, skb)) +		goto rearm_timer; + +	pcount = tcp_skb_pcount(skb); +	if (WARN_ON(!pcount)) +		goto rearm_timer; + +	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { +		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, +					  GFP_ATOMIC))) +			goto rearm_timer; +		skb = tcp_write_queue_tail(sk); +	} + +	if (WARN_ON(!skb || !tcp_skb_pcount(skb))) +		goto rearm_timer; + +	err = __tcp_retransmit_skb(sk, skb); + +	/* Record snd_nxt for loss detection. */ +	if (likely(!err)) +		tp->tlp_high_seq = tp->snd_nxt; + +rearm_timer: +	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, +				  inet_csk(sk)->icsk_rto, +				  TCP_RTO_MAX); + +	if (likely(!err)) +		NET_INC_STATS_BH(sock_net(sk), +				 LINUX_MIB_TCPLOSSPROBES);  }  /* Push out any pending frames which were held back due to @@ -1809,7 +2159,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,  	if (unlikely(sk->sk_state == TCP_CLOSE))  		return; -	if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC)) +	if (tcp_write_xmit(sk, cur_mss, nonagle, 0, +			   sk_gfp_atomic(sk, GFP_ATOMIC)))  		tcp_check_probe_timer(sk);  } @@ -1889,7 +2240,8 @@ u32 __tcp_select_window(struct sock *sk)  	 */  	int mss = icsk->icsk_ack.rcv_mss;  	int free_space = tcp_space(sk); -	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); +	int allowed_space = tcp_full_space(sk); +	int full_space = min_t(int, tp->window_clamp, allowed_space);  	int window;  	if (mss > full_space) @@ -1898,11 +2250,23 @@ u32 __tcp_select_window(struct sock *sk)  	if (free_space < (full_space >> 1)) {  		icsk->icsk_ack.quick = 0; -		if (tcp_memory_pressure) +		if (sk_under_memory_pressure(sk))  			tp->rcv_ssthresh = min(tp->rcv_ssthresh,  					       4U * tp->advmss); -		if (free_space < mss) +		/* free_space might become our new window, make sure we don't +		 * increase it due to wscale. +		 */ +		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); + +		/* if free space is less than mss estimate, or is below 1/16th +		 * of the maximum allowed, try to move to zero-window, else +		 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and +		 * new incoming data is dropped due to memory limits. +		 * With large window, mss test triggers way too late in order +		 * to announce zero window in time before rmem limit kicks in. +		 */ +		if (free_space < (allowed_space >> 4) || free_space < mss)  			return 0;  	} @@ -1971,7 +2335,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)  	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;  	/* Merge over control information. This moves PSH/FIN etc. over */ -	TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags; +	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;  	/* All done, get rid of second SKB and account for it so  	 * packet counting does not break. @@ -1989,22 +2353,22 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)  }  /* Check if coalescing SKBs is legal. 
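
Inside the low-free-space branch of __tcp_select_window() above, free_space is first rounded down to the window-scale granularity, and the window is forced to zero once free space drops under one MSS or 1/16th of the allowed space. A minimal userspace sketch of that test (simplified; the memory-pressure clamp and the rest of the window selection are left out):

	#include <stdint.h>
	#include <stdio.h>

	/* Round free_space down to wscale granularity so the advertised
	 * window cannot be silently inflated by scaling, then fall back
	 * to a zero window when too little room is left.
	 */
	static uint32_t clamp_free_space(uint32_t free_space, uint32_t allowed_space,
					 uint32_t mss, uint32_t rcv_wscale)
	{
		free_space &= ~((1U << rcv_wscale) - 1);	/* round_down */

		if (free_space < (allowed_space >> 4) || free_space < mss)
			return 0;				/* announce zero window */

		return free_space;
	}

	int main(void)
	{
		/* 6 KB free out of a 1 MB buffer is under 1/16th: go to zero
		 * window early instead of letting rcvbuf growth hit the limit.
		 */
		printf("%u\n", clamp_free_space(6 * 1024, 1024 * 1024, 1460, 7));
		return 0;
	}
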
*/ -static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) +static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)  {  	if (tcp_skb_pcount(skb) > 1) -		return 0; +		return false;  	/* TODO: SACK collapsing could be used to remove this condition */  	if (skb_shinfo(skb)->nr_frags != 0) -		return 0; +		return false;  	if (skb_cloned(skb)) -		return 0; +		return false;  	if (skb == tcp_send_head(sk)) -		return 0; +		return false;  	/* Some heurestics for collapsing over SACK'd could be invented */  	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) -		return 0; +		return false; -	return 1; +	return true;  }  /* Collapse packets in the retransmit queue to make to create @@ -2015,11 +2379,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *skb = to, *tmp; -	int first = 1; +	bool first = true;  	if (!sysctl_tcp_retrans_collapse)  		return; -	if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN) +	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)  		return;  	tcp_for_write_queue_from_safe(skb, tmp, sk) { @@ -2029,7 +2393,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,  		space -= skb->len;  		if (first) { -			first = 0; +			first = false;  			continue;  		} @@ -2038,7 +2402,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,  		/* Punt if not enough space exists in the first SKB for  		 * the data in the second  		 */ -		if (skb->len > skb_tailroom(to)) +		if (skb->len > skb_availroom(to))  			break;  		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) @@ -2052,7 +2416,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,   * state updates are done by the caller.  Returns non-zero if an   * error occurred which prevented the send.   */ -int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct inet_connection_sock *icsk = inet_csk(sk); @@ -2071,6 +2435,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))  		return -EAGAIN; +	if (skb_still_in_host_queue(sk, skb)) +		return -EBUSY; +  	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {  		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))  			BUG(); @@ -2093,12 +2460,14 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  		return -EAGAIN;  	if (skb->len > cur_mss) { -		if (tcp_fragment(sk, skb, cur_mss, cur_mss)) +		if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))  			return -ENOMEM; /* We'll try again later. */  	} else {  		int oldpcount = tcp_skb_pcount(skb);  		if (unlikely(oldpcount > 1)) { +			if (skb_unclone(skb, GFP_ATOMIC)) +				return -ENOMEM;  			tcp_init_tso_segs(sk, skb, cur_mss);  			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));  		} @@ -2106,38 +2475,45 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  	tcp_retrans_try_collapse(sk, skb, cur_mss); -	/* Some Solaris stacks overoptimize and ignore the FIN on a -	 * retransmit when old data is attached.  So strip it off -	 * since it is cheap to do so and saves bytes on the network. 
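
tcp_retrans_try_collapse() above folds a small neighbouring segment into the spare room of the previous one (now checked with skb_availroom()) so a single retransmit can cover both, subject to the eligibility checks in tcp_can_collapse(). A toy, non-kernel model of that idea with plain buffers:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>
	#include <string.h>

	/* "avail" plays the role of skb_availroom(): only collapse when the
	 * next payload fits entirely in the first buffer's spare room.
	 */
	struct buf {
		char	data[2048];
		size_t	len;
	};

	static bool try_collapse(struct buf *to, const struct buf *next)
	{
		size_t avail = sizeof(to->data) - to->len;

		if (next->len > avail)		/* not enough spare room */
			return false;

		memcpy(to->data + to->len, next->data, next->len);
		to->len += next->len;
		return true;
	}

	int main(void)
	{
		struct buf a = { .len = 500 }, b = { .len = 300 };

		printf("%s\n", try_collapse(&a, &b) ? "collapsed" : "kept separate");
		return 0;
	}
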
-	 */ -	if (skb->len > 0 && -	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && -	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { -		if (!pskb_trim(skb, 0)) { -			/* Reuse, even though it does some unnecessary work */ -			tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, -					     TCP_SKB_CB(skb)->flags); -			skb->ip_summed = CHECKSUM_NONE; -		} -	} -  	/* Make a copy, if the first transmission SKB clone we made  	 * is still in somebody's hands, else make a clone.  	 */  	TCP_SKB_CB(skb)->when = tcp_time_stamp; -	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); +	/* make sure skb->data is aligned on arches that require it +	 * and check if ack-trimming & collapsing extended the headroom +	 * beyond what csum_start can cover. +	 */ +	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || +		     skb_headroom(skb) >= 0xFFFF)) { +		struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, +						   GFP_ATOMIC); +		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : +			     -ENOBUFS; +	} else { +		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); +	} -	if (err == 0) { +	if (likely(!err)) { +		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;  		/* Update global TCP statistics. */  		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); - +		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) +			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);  		tp->total_retrans++; +	} +	return err; +} + +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	int err = __tcp_retransmit_skb(sk, skb); +	if (err == 0) {  #if FASTRETRANS_DEBUG > 0  		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { -			if (net_ratelimit()) -				printk(KERN_DEBUG "retrans_out leaked.\n"); +			net_dbg_ratelimited("retrans_out leaked\n");  		}  #endif  		if (!tp->retrans_out) @@ -2149,31 +2525,35 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  		if (!tp->retrans_stamp)  			tp->retrans_stamp = TCP_SKB_CB(skb)->when; -		tp->undo_retrans++; -  		/* snd_nxt is stored to detect loss of retransmitted segment,  		 * see tcp_input.c tcp_sacktag_write_queue().  		 */  		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; +	} else if (err != -EBUSY) { +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);  	} + +	if (tp->undo_retrans < 0) +		tp->undo_retrans = 0; +	tp->undo_retrans += tcp_skb_pcount(skb);  	return err;  }  /* Check if we forward retransmits are possible in the current   * window/congestion state.   */ -static int tcp_can_forward_retransmit(struct sock *sk) +static bool tcp_can_forward_retransmit(struct sock *sk)  {  	const struct inet_connection_sock *icsk = inet_csk(sk); -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	/* Forward retransmissions are possible only during Recovery. */  	if (icsk->icsk_ca_state != TCP_CA_Recovery) -		return 0; +		return false;  	/* No forward retransmissions in Reno are possible. */  	if (tcp_is_reno(tp)) -		return 0; +		return false;  	/* Yeah, we have to make difficult choice between forward transmission  	 * and retransmission... Both ways have their merits... 
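
The retransmit path above makes a private copy when the payload is not 4-byte aligned on architectures that require it, or when header trimming and collapsing have grown the headroom past what the 16-bit csum_start offset can express. A small illustration of that test (constants mirror the check; the function name is made up):

	#include <stdint.h>
	#include <stdio.h>

	static int needs_private_copy(uintptr_t data_ptr, unsigned int headroom,
				      int net_ip_align)
	{
		if (net_ip_align && (data_ptr & 3))	/* unaligned payload */
			return 1;
		if (headroom >= 0xFFFF)			/* csum_start would overflow */
			return 1;
		return 0;
	}

	int main(void)
	{
		printf("%d %d\n",
		       needs_private_copy(0x1002, 128, 1),	/* unaligned */
		       needs_private_copy(0x1000, 128, 1));	/* fine as-is */
		return 0;
	}
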
@@ -2184,9 +2564,9 @@ static int tcp_can_forward_retransmit(struct sock *sk)  	 */  	if (tcp_may_send_now(sk)) -		return 0; +		return false; -	return 1; +	return true;  }  /* This gets called after a retransmit timeout, and the initially @@ -2278,8 +2658,12 @@ begin_fwd:  		if (tcp_retransmit_skb(sk, skb))  			return; +  		NET_INC_STATS_BH(sock_net(sk), mib_idx); +		if (tcp_in_cwnd_reduction(sk)) +			tp->prr_out += tcp_skb_pcount(skb); +  		if (skb == tcp_write_queue_head(sk))  			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,  						  inet_csk(sk)->icsk_rto, @@ -2303,7 +2687,7 @@ void tcp_send_fin(struct sock *sk)  	mss_now = tcp_current_mss(sk);  	if (tcp_send_head(sk) != NULL) { -		TCP_SKB_CB(skb)->flags |= TCPHDR_FIN; +		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;  		TCP_SKB_CB(skb)->end_seq++;  		tp->write_seq++;  	} else { @@ -2365,11 +2749,11 @@ int tcp_send_synack(struct sock *sk)  	struct sk_buff *skb;  	skb = tcp_write_queue_head(sk); -	if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) { -		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); +	if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { +		pr_debug("%s: wrong queue state\n", __func__);  		return -EFAULT;  	} -	if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) { +	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {  		if (skb_cloned(skb)) {  			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);  			if (nskb == NULL) @@ -2383,66 +2767,50 @@ int tcp_send_synack(struct sock *sk)  			skb = nskb;  		} -		TCP_SKB_CB(skb)->flags |= TCPHDR_ACK; +		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;  		TCP_ECN_send_synack(tcp_sk(sk), skb);  	}  	TCP_SKB_CB(skb)->when = tcp_time_stamp;  	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);  } -/* Prepare a SYN-ACK. */ +/** + * tcp_make_synack - Prepare a SYN-ACK. + * sk: listener socket + * dst: dst entry attached to the SYNACK + * req: request_sock pointer + * + * Allocate one skb and build a SYNACK packet. + * @dst is consumed : Caller should not use it again. + */  struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,  				struct request_sock *req, -				struct request_values *rvp) +				struct tcp_fastopen_cookie *foc)  {  	struct tcp_out_options opts; -	struct tcp_extend_values *xvp = tcp_xv(rvp);  	struct inet_request_sock *ireq = inet_rsk(req);  	struct tcp_sock *tp = tcp_sk(sk); -	const struct tcp_cookie_values *cvp = tp->cookie_values;  	struct tcphdr *th;  	struct sk_buff *skb;  	struct tcp_md5sig_key *md5;  	int tcp_header_size;  	int mss; -	int s_data_desired = 0; -	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) -		s_data_desired = cvp->s_data_desired; -	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); -	if (skb == NULL) +	skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); +	if (unlikely(!skb)) { +		dst_release(dst);  		return NULL; - +	}  	/* Reserve space for headers. */  	skb_reserve(skb, MAX_TCP_HEADER); -	skb_dst_set(skb, dst_clone(dst)); +	skb_dst_set(skb, dst); +	security_skb_owned_by(skb, sk); -	mss = dst_metric(dst, RTAX_ADVMSS); +	mss = dst_metric_advmss(dst);  	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)  		mss = tp->rx_opt.user_mss; -	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ -		__u8 rcv_wscale; -		/* Set this up on the first call only */ -		req->window_clamp = tp->window_clamp ? 
: dst_metric(dst, RTAX_WINDOW); - -		/* limit the window selection if the user enforce a smaller rx buffer */ -		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && -		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) -			req->window_clamp = tcp_full_space(sk); - -		/* tcp_full_space because it is guaranteed to be the first packet */ -		tcp_select_initial_window(tcp_full_space(sk), -			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), -			&req->rcv_wnd, -			&req->window_clamp, -			ireq->wscale_ok, -			&rcv_wscale, -			dst_metric(dst, RTAX_INITRWND)); -		ireq->rcv_wscale = rcv_wscale; -	} -  	memset(&opts, 0, sizeof(opts));  #ifdef CONFIG_SYN_COOKIES  	if (unlikely(req->cookie_ts)) @@ -2450,9 +2818,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,  	else  #endif  	TCP_SKB_CB(skb)->when = tcp_time_stamp; -	tcp_header_size = tcp_synack_options(sk, req, mss, -					     skb, &opts, &md5, xvp) -			+ sizeof(*th); +	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, +					     foc) + sizeof(*th);  	skb_push(skb, tcp_header_size);  	skb_reset_transport_header(skb); @@ -2462,56 +2829,23 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,  	th->syn = 1;  	th->ack = 1;  	TCP_ECN_make_synack(req, th); -	th->source = ireq->loc_port; -	th->dest = ireq->rmt_port; +	th->source = htons(ireq->ir_num); +	th->dest = ireq->ir_rmt_port;  	/* Setting of flags are superfluous here for callers (and ECE is  	 * not even correctly set)  	 */  	tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,  			     TCPHDR_SYN | TCPHDR_ACK); -	if (OPTION_COOKIE_EXTENSION & opts.options) { -		if (s_data_desired) { -			u8 *buf = skb_put(skb, s_data_desired); - -			/* copy data directly from the listening socket. */ -			memcpy(buf, cvp->s_data_payload, s_data_desired); -			TCP_SKB_CB(skb)->end_seq += s_data_desired; -		} - -		if (opts.hash_size > 0) { -			__u32 workspace[SHA_WORKSPACE_WORDS]; -			u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS]; -			u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1]; - -			/* Secret recipe depends on the Timestamp, (future) -			 * Sequence and Acknowledgment Numbers, Initiator -			 * Cookie, and others handled by IP variant caller. -			 */ -			*tail-- ^= opts.tsval; -			*tail-- ^= tcp_rsk(req)->rcv_isn + 1; -			*tail-- ^= TCP_SKB_CB(skb)->seq + 1; - -			/* recommended */ -			*tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source); -			*tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */ - -			sha_transform((__u32 *)&xvp->cookie_bakery[0], -				      (char *)mess, -				      &workspace[0]); -			opts.hash_location = -				(__u8 *)&xvp->cookie_bakery[0]; -		} -	} -  	th->seq = htonl(TCP_SKB_CB(skb)->seq); -	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); +	/* XXX data is queued and acked as is. No buffer/window check */ +	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);  	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */  	th->window = htons(min(req->rcv_wnd, 65535U));  	tcp_options_write((__be32 *)(th + 1), tp, &opts);  	th->doff = (tcp_header_size >> 2); -	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); +	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);  #ifdef CONFIG_TCP_MD5SIG  	/* Okay, we have all we need - do the md5 hash if needed */ @@ -2528,7 +2862,7 @@ EXPORT_SYMBOL(tcp_make_synack);  /* Do all connect socket setups that can be done AF independent. 
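
tcp_make_synack() above writes the raw 16-bit window field; per RFC 1323 no window scaling applies to SYN or SYN-ACK segments, so the advertised value is clamped to 65535 regardless of the negotiated wscale. A trivial illustrative helper:

	#include <stdint.h>
	#include <stdio.h>

	/* Window field in a SYN-ACK is never scaled: cap at 65535. */
	static uint32_t synack_window(uint32_t rcv_wnd)
	{
		return rcv_wnd > 65535 ? 65535 : rcv_wnd;
	}

	int main(void)
	{
		printf("%u %u\n", synack_window(29200), synack_window(262144));
		return 0;
	}
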
*/  static void tcp_connect_init(struct sock *sk)  { -	struct dst_entry *dst = __sk_dst_get(sk); +	const struct dst_entry *dst = __sk_dst_get(sk);  	struct tcp_sock *tp = tcp_sk(sk);  	__u8 rcv_wscale; @@ -2552,7 +2886,7 @@ static void tcp_connect_init(struct sock *sk)  	if (!tp->window_clamp)  		tp->window_clamp = dst_metric(dst, RTAX_WINDOW); -	tp->advmss = dst_metric(dst, RTAX_ADVMSS); +	tp->advmss = dst_metric_advmss(dst);  	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)  		tp->advmss = tp->rx_opt.user_mss; @@ -2581,15 +2915,134 @@ static void tcp_connect_init(struct sock *sk)  	tp->snd_una = tp->write_seq;  	tp->snd_sml = tp->write_seq;  	tp->snd_up = tp->write_seq; -	tp->rcv_nxt = 0; -	tp->rcv_wup = 0; -	tp->copied_seq = 0; +	tp->snd_nxt = tp->write_seq; + +	if (likely(!tp->repair)) +		tp->rcv_nxt = 0; +	else +		tp->rcv_tstamp = tcp_time_stamp; +	tp->rcv_wup = tp->rcv_nxt; +	tp->copied_seq = tp->rcv_nxt;  	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;  	inet_csk(sk)->icsk_retransmits = 0;  	tcp_clear_retrans(tp);  } +static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + +	tcb->end_seq += skb->len; +	skb_header_release(skb); +	__tcp_add_write_queue_tail(sk, skb); +	sk->sk_wmem_queued += skb->truesize; +	sk_mem_charge(sk, skb->truesize); +	tp->write_seq = tcb->end_seq; +	tp->packets_out += tcp_skb_pcount(skb); +} + +/* Build and send a SYN with data and (cached) Fast Open cookie. However, + * queue a data-only packet after the regular SYN, such that regular SYNs + * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges + * only the SYN sequence, the data are retransmitted in the first ACK. + * If cookie is not cached or other error occurs, falls back to send a + * regular SYN with Fast Open cookie request option. + */ +static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	struct tcp_fastopen_request *fo = tp->fastopen_req; +	int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; +	struct sk_buff *syn_data = NULL, *data; +	unsigned long last_syn_loss = 0; + +	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */ +	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, +			       &syn_loss, &last_syn_loss); +	/* Recurring FO SYN losses: revert to regular handshake temporarily */ +	if (syn_loss > 1 && +	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { +		fo->cookie.len = -1; +		goto fallback; +	} + +	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) +		fo->cookie.len = -1; +	else if (fo->cookie.len <= 0) +		goto fallback; + +	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and +	 * user-MSS. Reserve maximum option space for middleboxes that add +	 * private TCP options. 
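
tcp_send_syn_data() above temporarily gives up on SYN+data after repeated losses: Fast Open is skipped while the clock is still within 60*HZ << syn_loss of the last recorded loss. The same backoff expressed in seconds (illustrative names, not kernel code):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* After more than one lost SYN+data, fall back to plain SYNs for
	 * an exponentially growing window of 60s << syn_loss.
	 */
	static bool skip_syn_data(unsigned int syn_loss, uint64_t now_sec,
				  uint64_t last_loss_sec)
	{
		if (syn_loss <= 1)
			return false;

		return now_sec < last_loss_sec + (60ULL << syn_loss);
	}

	int main(void)
	{
		/* two losses: avoid SYN-data for 60s << 2 = 240s after the last */
		printf("%d\n", skip_syn_data(2, 100, 50));	/* still backing off */
		return 0;
	}
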
The cost is reduced data space in SYN :( +	 */ +	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) +		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; +	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - +		MAX_TCP_OPTION_SPACE; + +	space = min_t(size_t, space, fo->size); + +	/* limit to order-0 allocations */ +	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); + +	syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space, +				   sk->sk_allocation); +	if (syn_data == NULL) +		goto fallback; + +	for (i = 0; i < iovlen && syn_data->len < space; ++i) { +		struct iovec *iov = &fo->data->msg_iov[i]; +		unsigned char __user *from = iov->iov_base; +		int len = iov->iov_len; + +		if (syn_data->len + len > space) +			len = space - syn_data->len; +		else if (i + 1 == iovlen) +			/* No more data pending in inet_wait_for_connect() */ +			fo->data = NULL; + +		if (skb_add_data(syn_data, from, len)) +			goto fallback; +	} + +	/* Queue a data-only packet after the regular SYN for retransmission */ +	data = pskb_copy(syn_data, sk->sk_allocation); +	if (data == NULL) +		goto fallback; +	TCP_SKB_CB(data)->seq++; +	TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; +	TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); +	tcp_connect_queue_skb(sk, data); +	fo->copied = data->len; + +	/* syn_data is about to be sent, we need to take current time stamps +	 * for the packets that are in write queue : SYN packet and DATA +	 */ +	skb_mstamp_get(&syn->skb_mstamp); +	data->skb_mstamp = syn->skb_mstamp; + +	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { +		tp->syn_data = (fo->copied > 0); +		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); +		goto done; +	} +	syn_data = NULL; + +fallback: +	/* Send a regular SYN with Fast Open cookie request option */ +	if (fo->cookie.len > 0) +		fo->cookie.len = 0; +	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); +	if (err) +		tp->syn_fastopen = 0; +	kfree_skb(syn_data); +done: +	fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */ +	return err; +} +  /* Build a SYN and send it off. */  int tcp_connect(struct sock *sk)  { @@ -2599,6 +3052,11 @@ int tcp_connect(struct sock *sk)  	tcp_connect_init(sk); +	if (unlikely(tp->repair)) { +		tcp_finish_connect(sk, NULL); +		return 0; +	} +  	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);  	if (unlikely(buff == NULL))  		return -ENOBUFS; @@ -2606,19 +3064,14 @@ int tcp_connect(struct sock *sk)  	/* Reserve space for headers. */  	skb_reserve(buff, MAX_TCP_HEADER); -	tp->snd_nxt = tp->write_seq;  	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); +	tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; +	tcp_connect_queue_skb(sk, buff);  	TCP_ECN_send_syn(sk, buff); -	/* Send it off. */ -	TCP_SKB_CB(buff)->when = tcp_time_stamp; -	tp->retrans_stamp = TCP_SKB_CB(buff)->when; -	skb_header_release(buff); -	__tcp_add_write_queue_tail(sk, buff); -	sk->sk_wmem_queued += buff->truesize; -	sk_mem_charge(sk, buff->truesize); -	tp->packets_out += tcp_skb_pcount(buff); -	err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); +	/* Send off SYN; include data in Fast Open. */ +	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : +	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);  	if (err == -ECONNREFUSED)  		return err; @@ -2660,8 +3113,9 @@ void tcp_send_delayed_ack(struct sock *sk)  		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements  		 * directly.  		 
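
The delayed-ACK change above reflects the new srtt_us representation: the smoothed RTT is kept as 8*RTT in microseconds, so the actual RTT is srtt_us >> 3, floored at the minimum delayed-ACK interval before it is used to cap the ACK timeout. A compact sketch (the 40 ms floor is an assumption here, not taken from this patch):

	#include <stdint.h>
	#include <stdio.h>

	#define DELACK_MIN_US	40000U		/* assumed 40 ms floor */

	/* srtt_us stores 8*RTT in microseconds; recover RTT and apply
	 * the delayed-ACK minimum.
	 */
	static uint32_t delack_rtt_us(uint32_t srtt_us)
	{
		uint32_t rtt = srtt_us >> 3;

		return rtt < DELACK_MIN_US ? DELACK_MIN_US : rtt;
	}

	int main(void)
	{
		printf("%u %u\n", delack_rtt_us(8 * 3000), delack_rtt_us(8 * 90000));
		return 0;
	}
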
*/ -		if (tp->srtt) { -			int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); +		if (tp->srtt_us) { +			int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), +					TCP_DELACK_MIN);  			if (rtt < max_ato)  				max_ato = rtt; @@ -2705,7 +3159,7 @@ void tcp_send_ack(struct sock *sk)  	 * tcp_transmit_skb() will set the ownership to this  	 * sock.  	 */ -	buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); +	buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));  	if (buff == NULL) {  		inet_csk_schedule_ack(sk);  		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; @@ -2720,7 +3174,7 @@ void tcp_send_ack(struct sock *sk)  	/* Send it off, this clears delayed acks for us. */  	TCP_SKB_CB(buff)->when = tcp_time_stamp; -	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); +	tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));  }  /* This routine sends a packet with an out of date sequence @@ -2740,7 +3194,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)  	struct sk_buff *skb;  	/* We don't queue it, tcp_transmit_skb() sets ownership. */ -	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); +	skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));  	if (skb == NULL)  		return -1; @@ -2755,6 +3209,14 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)  	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);  } +void tcp_send_window_probe(struct sock *sk) +{ +	if (sk->sk_state == TCP_ESTABLISHED) { +		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; +		tcp_xmit_probe_skb(sk, 0); +	} +} +  /* Initiate keepalive or window probe from timer. */  int tcp_write_wakeup(struct sock *sk)  { @@ -2780,13 +3242,13 @@ int tcp_write_wakeup(struct sock *sk)  		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||  		    skb->len > mss) {  			seg_size = min(seg_size, mss); -			TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; -			if (tcp_fragment(sk, skb, seg_size, mss)) +			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; +			if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))  				return -1;  		} else if (!tcp_skb_pcount(skb))  			tcp_set_skb_tso_segs(sk, skb, mss); -		TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; +		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;  		TCP_SKB_CB(skb)->when = tcp_time_stamp;  		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);  		if (!err)  | 
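
tcp_write_wakeup() above trims the head segment so a window probe never sends more than the peer's window, or one MSS, allows. A rough model of that sizing decision (illustrative only):

	#include <stdint.h>
	#include <stdio.h>

	/* Probe segment size: bounded by the remaining send window, the
	 * queued segment's length, and the current MSS.
	 */
	static uint32_t probe_seg_size(uint32_t window, uint32_t skb_len, uint32_t mss)
	{
		uint32_t seg = window < skb_len ? window : skb_len;

		return seg < mss ? seg : mss;
	}

	int main(void)
	{
		/* 512-byte window, 3000-byte head skb, 1460-byte MSS -> send 512 */
		printf("%u\n", probe_seg_size(512, 3000, 1460));
		return 0;
	}
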
